From 33917f5cefc841248703f1cad00ecdb316bb67e3 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 7 Jan 2026 21:31:21 -0500 Subject: [PATCH 01/94] MagpieTTS decoder model working on top of NeMo main branch Signed-off-by: Paarth Neekhara --- .../magpietts/magpietts_decoder_only.yaml | 144 ++ .../magpietts_decoder_only_lhotse.yaml | 169 ++ examples/tts/evalset_config.json | 5 + examples/tts/magpietts_decoder_only.py | 57 + examples/tts/magpietts_inference.py | 10 +- .../text_to_speech/tts_tokenizers.py | 1 + .../tts/data/text_to_speech_dataset.py | 20 + .../tts/data/text_to_speech_dataset_lhotse.py | 29 + nemo/collections/tts/models/__init__.py | 2 + .../tts/models/magpietts_decoder_only.py | 1729 +++++++++++++++++ .../modules/magpietts_inference/inference.py | 148 +- .../tts/modules/magpietts_inference/utils.py | 18 +- 12 files changed, 2301 insertions(+), 31 deletions(-) create mode 100644 examples/tts/conf/magpietts/magpietts_decoder_only.yaml create mode 100644 examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml create mode 100644 examples/tts/magpietts_decoder_only.py create mode 100644 nemo/collections/tts/models/magpietts_decoder_only.py diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only.yaml b/examples/tts/conf/magpietts/magpietts_decoder_only.yaml new file mode 100644 index 000000000000..8518fa79060b --- /dev/null +++ b/examples/tts/conf/magpietts/magpietts_decoder_only.yaml @@ -0,0 +1,144 @@ +name: Magpie-TTS-DecoderOnly-EN + +max_epochs: ??? +# Adjust batch size based on GPU memory +batch_size: 2 +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. +weighted_sampling_steps_per_epoch: null + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 +train_ds_meta: ??? +val_ds_meta: ??? 
+ +model: + transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. + context_duration_min: 5.0 + context_duration_max: 5.0 + load_cached_codes_if_available: true + + embedding_dim: 1536 + hidden_dim: 1536 + codecmodel_path: ??? + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + + # Local transformer parameters for autoregressive codebook prediction within a frame + local_transformer_type: "none" # "none", "autoregressive", "maskgit" + # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_loss_scale: 1.0 + local_transformer_n_layers: 3 + local_transformer_n_heads: 1 + local_transformer_hidden_dim: 256 + + cfg_unconditional_prob: 0.1 + # To get special_tokens of the tokenzer, you can do: + # model.tokenizer.first_tokenizer.additional_special_tokens + text_input_mode: "streaming" + frame_stacking_factor: 1 + phoneme_stacking_factor: 2 + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + dropout_text_input_prob: 0.3 + + phoneme_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 1.0 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + text_tokenizers: # Add more languages for multi-lingual TTS + english_phoneme: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: 
"scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + train_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + min_duration: 0.2 + max_duration: 20.0 + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + drop_last: true + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset + dataset_meta: ${val_ds_meta} + min_duration: 0.2 + max_duration: 20.0 + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + pin_memory: true + + optim: + _target_: torch.optim.AdamW + lr: 1e-4 + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: bf16-mixed + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 1 + num_sanity_val_steps: 0 + benchmark: false + gradient_clip_val: 2.5 + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_wandb_logger: false + wandb_logger_kwargs: + entity: null + name: ${name} + project: null + group: null + resume: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + resume_if_exists: true + resume_ignore_no_checkpoint: true \ No newline at end of file diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml b/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml new file mode 100644 index 000000000000..6ed9b529eac6 --- /dev/null +++ 
b/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml @@ -0,0 +1,169 @@ +name: Magpie-TTS-DecoderOnly-EN + +quadratic_duration: 20 + +# Adjust batch size based on GPU memory +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 + +model: + use_lhotse: true + transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. + context_duration_min: 5.0 + context_duration_max: 5.0 + load_cached_codes_if_available: true + + embedding_dim: 1536 + hidden_dim: 1536 + codecmodel_path: ??? + + # Local transformer parameters for autoregressive codebook prediction within a frame + local_transformer_type: "none" # "none", "autoregressive", "maskgit" + # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_loss_scale: 1.0 + local_transformer_n_layers: 3 + local_transformer_n_heads: 1 + local_transformer_hidden_dim: 256 + + cfg_unconditional_prob: 0.1 + + text_input_mode: "streaming" + frame_stacking_factor: 1 + phoneme_stacking_factor: 2 + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + dropout_text_input_prob: 0.3 + + phoneme_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 1.0 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + text_tokenizers: # Add more languages for multi-lingual TTS + english_phoneme: + _target_: 
nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + train_ds: + use_lhotse: ${model.use_lhotse} + volume_norm: true + + dataset: + min_duration: 0.2 + min_context_speaker_similarity: 0.6 + max_cer: 0.03 + batch_duration : ??? # in seconds. Adjust based on your GPU memory. + quadratic_duration: ${quadratic_duration} + use_bucketing: true + num_buckets: 20 + bucket_buffer_size: 20_000 + shuffle_buffer_size: 20_000 + num_cuts_for_bins_estimate: 20_000 + shard_seed: "trng" + drop_last: true + shuffle: true + num_workers: 6 + pin_memory: true + + input_cfg: + - type: lhotse_shar + shar_path: ??? + weight: 1.0 + tags: + tokenizer_names: ["english_phoneme"] + + + validation_ds: + use_lhotse: ${model.use_lhotse} + volume_norm: true + + dataset: + min_duration: 0.2 + min_context_speaker_similarity: 0.6 + max_cer: 0.03 + batch_duration: ??? # recommend to use smaller batch_duration for validation dataset than training dataset. + quadratic_duration: ${quadratic_duration} + use_bucketing: false + force_finite: true + drop_last: false + shuffle: false + num_workers: 2 + pin_memory: true + + input_cfg: + - type: lhotse_shar + shar_path: ??? + weight: 1.0 + tags: + tokenizer_names: ["english_phoneme"] + + optim: + _target_: torch.optim.AdamW + lr: 1e-4 + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: bf16-mixed + max_steps: ??? 
+ accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + limit_train_batches: 1_000 + val_check_interval: 1_000 + num_sanity_val_steps: 0 + benchmark: false + use_distributed_sampler: false # required because Lhotse has its own handling + gradient_clip_val: 2.5 + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_wandb_logger: false + wandb_logger_kwargs: + entity: null + name: ${name} + project: null + group: null + resume: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.4f}-{step}-{epoch}' + resume_if_exists: true + resume_ignore_no_checkpoint: true \ No newline at end of file diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 2d61a601f880..4ff4d12ad9eb 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -13,6 +13,11 @@ "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1_longform_tiny.json", "audio_dir": "/", "feature_dir": null + }, + "riva_hard_digits": { + "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" } } diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/magpietts_decoder_only.py new file mode 100644 index 000000000000..44859fee8d64 --- /dev/null +++ b/examples/tts/magpietts_decoder_only.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import lightning.pytorch as pl
+import torch.multiprocessing as mp
+from omegaconf import OmegaConf
+
+from nemo.collections.tts.models import MagpieTTSDecoderModel
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+
+@hydra_runner(config_path="conf/magpietts", config_name="magpietts_decoder_only")
+def main(cfg):
+    logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True))
+
+    # forcing "spawn" method for multiprocessing over "fork" when choosing multiple
+    # worker processes for dataloaders. By default, multiprocessing uses "fork" to create
+    # worker processes, which inherit the memory state of the main process, including its
+    # already initialized CUDA state. When a worker process tries to use
+    # CUDA, it runs into conflicts with the inherited, now potentially invalid,
+    # CUDA context, resulting in the CUDA initialization error. When
+    # num_workers=0, all dataloading happens in the main process, so there is no
+    # process forking and no CUDA context conflict. When num_workers>0, the standard way
+    # to fix this is to use "spawn" to create a completely new and clean python process for
+    # each worker, avoiding the problematic CUDA state inheritance.
+ mp.set_start_method("spawn", force=True) + + trainer = pl.Trainer(**cfg.trainer) + trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + model = MagpieTTSDecoderModel(cfg=cfg.model, trainer=trainer) + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) + + if cfg.get('mode', 'train') == 'train': + trainer.fit(model) + elif cfg.get('mode', 'train') == 'test': + trainer.test(model) + else: + raise NotImplementedError(f"Only train and test modes are supported. Got {cfg.mode}") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter \ No newline at end of file diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 9379b29668aa..d8d9e883fd04 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -190,7 +190,7 @@ def run_inference_and_evaluation( violin_plot_metrics.remove('utmosv2') # Load model - model, checkpoint_name = load_magpie_model(model_config) + model, checkpoint_name = load_magpie_model(model_config, is_decoder_only_model=inference_config.is_decoder_only_model) # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -502,6 +502,10 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group = parser.add_argument_group('Quality Targets') target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) + target_group.add_argument('--is_decoder_only_model', action='store_true') + target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) + target_group.add_argument('--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial']) + target_group.add_argument('--dropout_text_input', action='store_true') return parser @@ 
-553,6 +557,10 @@ def main(argv=None): maskgit_noise_scale=args.maskgit_noise_scale, maskgit_fixed_schedule=args.maskgit_fixed_schedule, maskgit_sampling_type=args.maskgit_sampling_type, + is_decoder_only_model=args.is_decoder_only_model, + phoneme_input_type=args.phoneme_input_type, + phoneme_sampling_method=args.phoneme_sampling_method, + dropout_text_input=args.dropout_text_input, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 75a12da269ee..4ecd544df81e 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1216,6 +1216,7 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase self.tokenizer_pad_ids = tokenizer_pad_ids # Define aggregated token's pad value from the first tokenizer's pad value first_tokenizer = self.tokenizers[tokenizer_names[0]] + self.first_tokenizer = first_tokenizer if hasattr(first_tokenizer, "pad_token_id"): # Defined in PreTrainedTokenizerBase subclasses self.pad = first_tokenizer.pad_token_id elif hasattr(first_tokenizer, "pad"): # Defined in BaseTokenizer subclasses diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 3c158ee4bd8e..789636a569e3 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -403,6 +403,7 @@ def __init__( self.dataset_type = dataset_type self.tokenizer_config = tokenizer_config self.text_tokenizer = None # Assigned in worker_init_fn in model file + self.phoneme_tokenizer = None # Assigned in worker_init_fn in model file (if any) self.load_16khz_audio = load_16khz_audio self.use_text_conditioning_tokenizer = use_text_conditioning_tokenizer self.text_conditioning_tokenizer_name = 
text_conditioning_tokenizer_name @@ -434,6 +435,13 @@ def __getitem__(self, index): "text_len": text_len, } + if self.phoneme_tokenizer is not None: + phoneme_tokens = self.phoneme_tokenizer.encode(data.text) + phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens_len = len(phoneme_tokens) + example["phoneme_tokens"] = torch.tensor(phoneme_tokens, dtype=torch.int32) + example["phoneme_tokens_len"] = phoneme_tokens_len + if self.load_cached_codes_if_available and 'target_audio_codes_path' in data.manifest_entry: audio_codes_path = data.manifest_entry['target_audio_codes_path'] audio_codes = torch.load(audio_codes_path) # (C, T) @@ -632,6 +640,8 @@ def collate_fn(self, batch: List[dict]): raw_text_list = [] language_list = [] speaker_indices_list = [] + phoneme_tokens_list = [] + phoneme_tokens_len_list = [] for example in batch: dataset_name_list.append(example["dataset_name"]) raw_text_list.append(example["raw_text"]) @@ -642,6 +652,9 @@ def collate_fn(self, batch: List[dict]): if 'audio_filepath' in example: audio_filepath_list.append(example["audio_filepath"]) + if 'phoneme_tokens' in example: + phoneme_tokens_list.append(example["phoneme_tokens"]) + phoneme_tokens_len_list.append(example["phoneme_tokens_len"]) if 'audio' in example: audio_list.append(example["audio"]) @@ -711,6 +724,13 @@ def collate_fn(self, batch: List[dict]): batch_dict['audio_codes'] = batch_audio_codes batch_dict['audio_codes_lens'] = batch_audio_codes_len + if len(phoneme_tokens_list) > 0: + batch_phoneme_tokens_len = torch.IntTensor(phoneme_tokens_len_list) + phoneme_tokens_max_len = int(batch_phoneme_tokens_len.max().item()) + batch_phoneme_tokens = stack_tensors(phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad) + batch_dict['phoneme_tokens'] = batch_phoneme_tokens + batch_dict['phoneme_tokens_lens'] = batch_phoneme_tokens_len + if len(context_audio_list) > 0: 
batch_context_audio_len = torch.IntTensor(context_audio_len_list) context_audio_max_len = int(batch_context_audio_len.max().item()) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 1ee0b05bef62..4bd378151b9a 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -59,6 +59,13 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer +def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): + phoneme_tokenizer = instantiate(phoneme_tokenizer_config) + phoneme_vocab_size = len(phoneme_tokenizer.tokens) + phoneme_tokenizer.bos_token_id = phoneme_vocab_size + phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 + phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 + return phoneme_tokenizer def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
@@ -140,6 +147,7 @@ def __init__( tokenizer_config: DictConfig = None, text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, + phoneme_tokenizer_config: DictConfig = None, ): super().__init__() self.sample_rate = sample_rate @@ -160,8 +168,10 @@ def __init__( self.context_duration_max = context_duration_max self.tokenizer_config = tokenizer_config self.text_tokenizer = None + self.phoneme_tokenizer = None self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob + self.phoneme_tokenizer_config = phoneme_tokenizer_config def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -188,6 +198,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: self.eos_id = self.bos_id + 1 self.pad_id = self.text_tokenizer.pad + if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: + worker_info = torch.utils.data.get_worker_info() + worker_id = worker_info.id if worker_info is not None else 0 + logging.info(f"Worker {worker_id} initializing phoneme tokenizer...") + self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) + # define list to store batched information dataset_name_list = [] audio_list = [] @@ -210,6 +226,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raw_text_list = ( [] ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. 
+ phoneme_token_list = [] + phoneme_token_len_list = [] for cut in cuts: speaker = cut.supervisions[0].speaker if not check_speaker_format(speaker): @@ -390,6 +408,13 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: token_list.append(tokens) token_len_list.append(text_len) + if self.phoneme_tokenizer is not None: + phoneme_tokens = self.phoneme_tokenizer.encode(text_str) + phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens_len = len(phoneme_tokens) + phoneme_token_list.append(torch.tensor(phoneme_tokens, dtype=torch.int32)) + phoneme_token_len_list.append(phoneme_tokens_len) + if self.include_align_prior: align_prior = beta_binomial_prior_distribution( phoneme_count=text_len, mel_count=spec_len, scaling_factor=self.prior_scaling_factor @@ -409,6 +434,10 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: "text_lens": torch.IntTensor(token_len_list), } + if self.phoneme_tokenizer is not None: + batch_dict["phoneme_tokens"] = collate_vectors(phoneme_token_list, padding_value=self.phoneme_tokenizer.pad) + batch_dict["phoneme_tokens_lens"] = torch.IntTensor(phoneme_token_len_list) + # audio for SV. 
if len(audio_list_16khz) > 0: batch_dict["audio_16khz"] = collate_vectors(audio_list_16khz, padding_value=0.0) diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 15d592dca2f7..6e781bed19ef 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -18,6 +18,7 @@ from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel +from nemo.collections.tts.models.magpietts_decoder_only import MagpieTTSDecoderModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, @@ -34,6 +35,7 @@ "HifiGanModel", "InferBatchOutput", "MagpieTTSModel", + "MagpieTTSDecoderModel", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py new file mode 100644 index 000000000000..1b60b4b7b6ed --- /dev/null +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -0,0 +1,1729 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List, Sequence, Tuple
+import torch
+import wandb
+from hydra.utils import instantiate
+from functools import partial
+from lightning.pytorch import Trainer
+from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
+from omegaconf import DictConfig
+from torch import nn
+from torch.utils.data import get_worker_info
+
+from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
+from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers, instantiate_phoneme_tokenizer
+
+from nemo.collections.tts.models import AudioCodecModel
+from nemo.collections.tts.modules import transformer_2501
+
+from nemo.collections.tts.modules.magpietts_modules import CharAwareSubwordEncoder, SpecialAudioToken, LocalTransformerType, cosine_schedule
+from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths
+
+from nemo.core.classes import ModelPT
+from nemo.core.classes.common import PretrainedModelInfo
+from nemo.utils import logging
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM
+)
+import time
+from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter
+import random
+
+
+
+def worker_init_fn(worker_id):
+    # For mp.set_start_method("spawn", force=True)
+    # The dataset class should be picklable, so we initialize non-picklable objects here
+    logging.info(f"Worker {worker_id} initializing...")
+    worker_info = get_worker_info()
+    dataset = worker_info.dataset  # Get the dataset instance in this worker
+    tokenizer = setup_tokenizers(
+        dataset.tokenizer_config, mode=dataset.dataset_type
+    )
+    dataset.text_tokenizer = tokenizer
+    # The attribute may exist but be None (e.g. no phoneme tokenizer configured),
+    # so check the value rather than mere attribute presence.
+    if getattr(dataset, 'phoneme_tokenizer_config', None) is not None:
+        dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config)
+
+
+class MagpieTTSDecoderModel(ModelPT):
+    """
+    Decoder-only Magpie-TTS model: a single causal transformer decoder operating
+    over interleaved text and audio-codec token streams.
+    """
+
+    def __init__(self, cfg: DictConfig, 
trainer: 'Trainer' = None): + self.world_size = 1 + if trainer is not None: + self.world_size = trainer.num_nodes * trainer.num_devices + + # load codec + codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) + self.sample_rate = codec_model.sample_rate + + if hasattr(codec_model, "discriminator"): + # del codec discriminator to free memory + del codec_model.discriminator + + # Set up codebook configuration + vector_quantizer = cfg.get('vector_quantizer') + if vector_quantizer is not None: + vector_quantizer = instantiate(vector_quantizer) + num_audio_codebooks = vector_quantizer.num_codebooks + codebook_size = vector_quantizer.codebook_size + codec_converter = VectorQuantizerIndexConverter( + vector_quantizer_original=codec_model.vector_quantizer, + vector_quantizer_new=vector_quantizer, + ) + data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks + else: + num_audio_codebooks = codec_model.num_codebooks + data_num_audio_codebooks = num_audio_codebooks + codebook_size = codec_model.codebook_size + codec_converter = None + + + # The dataloader needs to know the number of codebooks that the context codes were stored in + # In the case where there are no context codes saved, and there is no context audio (in the text context path), + # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for + # data_num_audio_codebooks + self.data_num_audio_codebooks = data_num_audio_codebooks + self.num_audio_codebooks = num_audio_codebooks + self.codebook_size = codebook_size + + + self.codec_model_samples_per_frame = codec_model.samples_per_frame + # Our codebooks start with actual audio codec tokens, followed by special tokens. + # The `forced_*` options are for backward compatibility for models trained with older code. + num_audio_tokens = codec_model.codebook_size + # Our codebooks start with actual audio codec tokens, followed by special tokens. 
+        # The `forced_*` options are for backward compatibility for models trained with older code.
+        get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size)
+        self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS)
+        self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS)
+        self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS)
+        self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS)
+        self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN)
+        self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken)
+        self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False)
+
+        # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer.
+        self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None)
+        if self.text_conditioning_tokenizer_name is None:
+            self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0]
+
+        self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0)
+        self.text_input_mode = cfg.get('text_input_mode', 'full')
+        self.streaming_speech_delay = cfg.get('streaming_speech_delay', 3)
+        self.streaming_phonemes_delay = cfg.get('streaming_phonemes_delay', 2)
+        self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1)
+
+        self.tokenizer = setup_tokenizers(
+            all_tokenizers_config=cfg.text_tokenizers,
+            mode='train',
+        )
+
+        num_tokens_tokenizer = len(self.tokenizer.tokens)
+        num_tokens = num_tokens_tokenizer + 3  # +3 for BOS, EOS, and the CFG unconditional (unk) token
+        self.bos_id = num_tokens - 3
+        self.eos_id = num_tokens - 2
+        self.cfg_unk_token_id = num_tokens - 1
+        self.phoneme_tokenizer = None
+        self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0)
+        self.dropout_phoneme_input_prob = cfg.get('dropout_phoneme_input_prob', 0.0)
+        if cfg.get('phoneme_tokenizer', None) is not None:
+            self.phoneme_tokenizer = 
instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) + self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + + + self.pad_context_text_to_max_duration = False + + super().__init__(cfg=cfg, trainer=trainer) + + # This needs to happen after super().__init__() + self._codec_model = codec_model + self._codec_model.freeze() #Lightning does requires_grad = False and self.eval() + self._codec_converter = codec_converter + + audio_embeddings = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) + self.audio_embeddings = nn.ModuleList(audio_embeddings) + + if self.phoneme_tokenizer is not None: + phoneme_embeddings = [] + for _ in range(self.phoneme_stacking_factor): + phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) + self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) + + + if cfg.transformer_hf_backend == "custom_qwen3_moe": + # from transformers.models import qwen3_moe + # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) + # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + from transformers.models import qwen2_moe + config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32) + self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) + else: + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + 
self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) + self.decoder.set_input_embeddings(self.text_embedding) + + if self.use_bpe_char_tokenizer: + # BPE char tokenizer + assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" + tokenizer_name = self.tokenizer.tokenizer_names[0] + tokenizer = self.tokenizer.tokenizers[tokenizer_name] + subword_vocab = tokenizer.get_vocab() + # special tokens will be stored as it is in the char_vocab + # Each special token will only be mapped to one char id + special_vocab = { + '': self.bos_id, + '': self.eos_id, + '': self.cfg_unk_token_id, + } + self.cas_encoder = CharAwareSubwordEncoder( + d_embed=cfg.embedding_dim, + llm_tokenizer_vocab=subword_vocab, + subword_padding_idx=self.tokenizer.pad, + special_vocab=special_vocab + ) + + self.final_proj = nn.Linear(cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor) + self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') + + self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) + logging.info(f"Local transformer type: {self.local_transformer_type}") + if self.local_transformer_type != LocalTransformerType.NO_LT: + local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) + if local_transformer_hidden_dim != cfg.hidden_dim: + self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) + else: + self.local_transformer_in_projection = nn.Identity() + self.local_transformer = transformer_2501.Transformer( + n_layers=self.cfg.get('local_transformer_n_layers', 2), + d_model=local_transformer_hidden_dim, + d_ffn=local_transformer_hidden_dim*4, + sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), + kernel_size=1, + is_causal=self.local_transformer_type == LocalTransformerType.AR, + max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, + 
use_learnable_pos_emb=True, + ) + local_transformer_out_projections = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + # Have a separate projection layer for each codebook, to distinguish between them + local_transformer_out_projections.append(nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook)) + self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + + + def state_dict(self, destination=None, prefix='', keep_vars=False): + """ + Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model + from the checkpoint. The codec model is saved in a separate checkpoint. + """ + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + # Don't save the speaker verification and codec model in the state dict + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] + for key in list(state_dict.keys()): + if any([substring in key for substring in keys_substrings_to_exclude]): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + """ + Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when + strict is True. + When strict is False, we can call pytorch's load_state_dict. + When strict is True, we loop through all parameters and rename them to enable loading. + """ + if strict == False: + super().load_state_dict(state_dict, strict=False) + for name, child in self.named_children(): + if name in ['_speaker_verification_model', '_codec_model']: + continue + if any(param.numel() > 0 for param in child.parameters()): + # If the module has parameters, we want to change the default mapping so that the state_dict gets + # loaded. 
+ # Ex: state_dict[encoder.position_embeddings.weight] -> new_state_dict[position_embeddings.weight] + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." + if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot):]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def audio_to_codes(self, audio, audio_len, audio_type='target'): + # audio: (B, T) + # audio_len: (B,) + if audio_type == 'target': + audio_eos_id = self.audio_eos_id + audio_bos_id = self.audio_bos_id + elif audio_type == 'context': + audio_eos_id = self.context_audio_eos_id + audio_bos_id = self.context_audio_bos_id + else: + raise ValueError(f"Received audio_type of {audio_type}. Must be `target` or `context`") + + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len) + if self._codec_converter is not None: + codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=codes_len) + # Add a timestep to begining and end of codes tensor + bos_tensor = torch.full( + (codes.size(0), codes.size(1), 1), audio_bos_id, dtype=codes.dtype, device=codes.device + ) + pad_tensor = torch.full( + (codes.size(0), codes.size(1), 1), 0, dtype=codes.dtype, device=codes.device + ) # 0 is the padding token in the audio codebook + codes = torch.cat([bos_tensor, codes, pad_tensor], dim=-1) + # codes: (B, C, T') + # codes_len: (B,) + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] + 1] = audio_eos_id + codes_len = codes_len + 2 + + return codes.long(), codes_len.long() + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + # codes_len: (B,) + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + # Unstack the audio codes if they are stacked + codes, codes_len = self.unstack_codes(codes, codes_len, 
self.frame_stacking_factor)
+
+        if codes.size(2) < 5:
+            # The codec decoder needs a minimum number of frames; pad with zeros (the padding token).
+            # NOTE: the pad amount must be computed BEFORE the cat — afterwards codes.size(2) is 5,
+            # so the previous `codes_len + 5 - codes.size(2)` always added 0 and lengths never grew.
+            pad_amount = 5 - codes.size(2)
+            codes = torch.cat([codes, torch.zeros(codes.size(0), codes.size(1), pad_amount, device=codes.device)], dim=2).long()
+            codes_len = codes_len + pad_amount
+
+        self._codec_model.eval()
+        with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32):
+            # Make a copy to avoid modifying the original tensor if it's used elsewhere
+            codes_copy = codes.clone()
+            # Replace eos and bos tokens with padding in the copied tensor
+            codes_copy[codes == self.audio_bos_id] = 0  # zero is the padding token
+            codes_copy[codes == self.audio_eos_id] = 0
+            # Pass the modified integer token IDs
+            if self._codec_converter is not None:
+                codes_copy = self._codec_converter.convert_new_to_original(
+                    audio_tokens=codes_copy, audio_lens=codes_len
+                )
+            audio, audio_len = self._codec_model.decode(tokens=codes_copy, tokens_len=codes_len)
+        # audio: (B, T)
+        # audio_len: (B,)
+        return audio, audio_len
+
+    def embed_audio_tokens(self, audio_tokens):
+        # audio_tokens: (B, C, T')
+        # Add and average the embeddings of the audio tokens across the codebooks
+        audio_embedding = None
+        for c in range(audio_tokens.size(1)):
+            embedding = self.audio_embeddings[c](audio_tokens[:, c, :])
+            if audio_embedding is None:
+                audio_embedding = embedding
+            else:
+                audio_embedding = audio_embedding + embedding
+        audio_embedding = audio_embedding / audio_tokens.size(1)
+        return audio_embedding
+
+    def embed_phoneme_tokens(self, phoneme_tokens):
+        # phoneme_tokens: (B, S, T')
+        phoneme_embedding = None
+        for c in range(phoneme_tokens.size(1)):
+            embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :])
+            if phoneme_embedding is None:
+                phoneme_embedding = embedding
+            else:
+                phoneme_embedding = phoneme_embedding + embedding
+        phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1)
+        return phoneme_embedding
+
+    def compute_local_transformer_logits(self, 
dec_out, audio_codes_target, targets_offset_by_one=False): + """ + Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. + This function is used in training and validation, not inference/sampling. + The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, + (using an 8-codebook setup as an example): + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + + dec_out: (B, T', E) + audio_codes_target: (B, C, T') + targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) + if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. 
(MaskGit) + """ + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + local_transformer_input = [dec_out_all] + for codebook_num in range(audio_codes_target.size(1)): + codes = audio_codes_target[:, codebook_num] # (B, T') + codes = codes.reshape(-1) # (B*T',) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + local_transformer_input.append(codebook_embedding) + + local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) + _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) + if not targets_offset_by_one: + # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. + local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) + else: + # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
+ local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + all_code_logits = [] + for codebook_num in range(audio_codes_target.size(1)): + # Using a separate projection layer for each codebook (to distinguish between them) + # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) + codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num, :]) # (B*T', num_all_tokens_per_codebook) + all_code_logits.append(codebook_logits) + all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) + + all_code_logits = all_code_logits.view( + audio_codes_target.size(0), audio_codes_target.size(2), -1 + ) # (B, T', C * num_all_tokens_per_codebook) + + return all_code_logits + + def maskgit_create_random_mask(self, codes): + """ + Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. + """ + # Codes: (B, C, T) + B,C,T = codes.shape + # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
+ rand_values = torch.rand(B,T, device=codes.device) + # apply the cosine schedule + frac_masked = cosine_schedule(rand_values) + # how many positions to mask + n_masked = torch.ceil(frac_masked * C).long() # B,T + # start from all unmasked + mask = torch.zeros_like(codes, dtype=torch.bool) + # The code further below is the vectorized version of this: + # for b in range(B): + # for t in range(T): + # if n_masked[b,t] > 0: + # # get a random permutation of the codebook indices + # perm = torch.randperm(C) + # # mask the top n_masked positions + # mask[b, perm[:n_masked[b,t]], t] = True + # + # Create random permutations + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) + # Create a mask tensor where each position indicates if it should be masked + mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) + mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) + # Apply the random permutations to the mask + mask = torch.gather(mask, 1, random_permutations) + + return mask # (B, C, T) + + def maskgit_apply_random_mask(self, codes): + # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. + # Codes: (B, C, T) + mask = self.maskgit_create_random_mask(codes) + ## replace some tokens with MASK_TOKEN + codes_with_mask = torch.where(mask, self.mask_token_id, codes) + return codes_with_mask, mask + + def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): + """ + Computes the audio codebook loss. Used by + (1) The main Magpie-TTS transformer + (2) The local transformer, for both autoregressive and MaskGit methods + + logits: (B, T', num_codebooks * num_tokens_per_codebook) + audio_codes: (B, C, T') + audio_codes_lens: (B,) + mask_tokens_mask: (B, C, T') True for tokens that were replaced with the MASK_TOKEN and should + therefore be the only ones included in the loss computation. 
+ """ + loss_mask = get_mask_from_lengths(audio_codes_lens) + if mask_tokens_mask is not None: + # For MaskGit we only compute loss for the masked tokens. + # *Both* conditions must be true: + # 1. the token is masked + # 2. the token is not padding + loss_mask = loss_mask.unsqueeze(1) * mask_tokens_mask + if not loss_mask.any(): + # Without this we were very rarely getting NaNs in the loss + logging.warning("No tokens valid were found in compute_loss()!") + return torch.tensor(0.0, device=loss_mask.device), loss_mask + else: + # repeat loss mask for each codebook to simplify code below + loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) + total_codebook_loss = None + for codebook in range(audio_codes.size(1)): + si = codebook * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = logits[:, :, si:ei] # (B, T', num_tokens_per_codebook) + codebook_targets = audio_codes[:, codebook] # (B, T') + codebook_loss = self.cross_entropy_loss( + codebook_logits.permute(0, 2, 1), codebook_targets # (B, num_tokens_per_codebook, T') + ) # (B, T') + codebook_loss = codebook_loss * loss_mask[:, codebook, :] + codebook_loss = codebook_loss.sum() / loss_mask[:, codebook, :].sum() + if total_codebook_loss is None: + total_codebook_loss = codebook_loss + else: + total_codebook_loss = total_codebook_loss + codebook_loss + + total_codebook_loss = total_codebook_loss / audio_codes.size(1) + return total_codebook_loss, loss_mask + + def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): + loss_mask = get_mask_from_lengths(phoneme_tokens_lens) + total_phoneme_loss = None + for codebook in range(self.phoneme_stacking_factor): + si = codebook * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + phoneme_logits = logits[:, :, si:ei] + phoneme_targets = phoneme_tokens[:, codebook] + phoneme_loss = self.cross_entropy_loss(phoneme_logits.permute(0, 2, 1), phoneme_targets) + phoneme_loss = phoneme_loss 
* loss_mask + phoneme_loss = phoneme_loss.sum() / loss_mask.sum() + if total_phoneme_loss is None: + total_phoneme_loss = phoneme_loss + else: + total_phoneme_loss = total_phoneme_loss + phoneme_loss + total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor + return total_phoneme_loss, loss_mask + + + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + # hidden_states = backend_out.last_hidden_state # (B, T_total, H) + return backend_out + + + def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): + # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) + # audio_codes_lens: (B,) + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits[:, :, si:ei] + codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) + # argmax to get the tokens + codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') + all_preds.append(codebook_preds) + + all_preds = torch.stack(all_preds, dim=1) # (B, C, T') + audio_mask = get_mask_from_lengths(audio_codes_lens) + all_preds = all_preds * audio_mask.unsqueeze(1) + + return all_preds + + def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0, n_steps=3): + """ + Sample codes for one timestep from the local transformer using MaskGit. 
+ """ + if self.frame_stacking_factor > 1: + raise NotImplementedError("MaskGit sampling is not implemented for frame stacking factor > 1") + # dec_output: (B, E) + device = dec_output.device + # disable KV cache since our transformer is not causal + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input_init = self.local_transformer_in_projection(dec_output) # (B, 1, D) where D is the dimension of the local transformer + C = self.num_audio_codebooks + B = dec_output.size(0) + + min_confidence = float("-inf") + max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? + confidences = min_confidence * torch.ones(B, C, device=device) + # initialize to all masked + codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) + sampled_codes = codes.clone() + for step in range(n_steps): + # get mask fraction + frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) + # how many codebooks to mask + n_masked = torch.ceil(C * frac_masked).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
+            n_unmasked = C - n_masked
+            # pick top-confidence codebooks up to n_unmasked
+            _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1)
+
+            # replace masks of the top-k confident codebooks with the codes that were sampled for them
+            unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices)
+            codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)
+
+            # build transformer input
+            local_transformer_input = local_transformer_input_init
+            for codebook_num in range(C):
+                next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1)  # (B, 1, 768)
+                next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)  # (B, 1, d_local)
+                local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)  # (B, codebook_num+1, d_local)
+
+            # run transformer
+            _mask = torch.ones(B, C+1, device=device)
+            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']  # (B, C+1, d_local)
+
+            # get logits
+            logits = []
+            for codebook_num in range(C):
+                # The `codebook_num+1` is to drop first position which corresponds to the magpie latent
+                codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num+1, :])  # (B, num_audio_tokens_per_codebook)
+                logits.append(codebook_logits)
+            logits = torch.stack(logits, dim=1)  # (B, C, num_audio_tokens_per_codebook)
+
+            # apply CFG
+            if use_cfg:
+                actual_batch_size = logits.size(0) // 2
+                conditional_logits = logits[:actual_batch_size]
+                unconditional_logits = logits[actual_batch_size:]
+                cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits
+                logits[:actual_batch_size] = cfg_logits
+
+            # handle unfinished and finished items
+            for item_idx in unfinished_items:
+                # Fix: logits is (B, C, V); suppress the EOS *token* across all codebooks.
+                # The previous `logits[item_idx, self.audio_eos_id]` indexed the codebook axis
+                # with the EOS token id, wiping out an unrelated codebook row instead
+                # (compare the (B, V) case in local_transformer_sample_autoregressive).
+                logits[item_idx, :, self.audio_eos_id] = float('-inf')
+            for item_idx in finished_items:
+                logits[item_idx, :, :] = float('-inf')
+
logits[item_idx, :, self.audio_eos_id] = 0.0 + + # sample with top-k + logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) + logits_rescored = logits.clone() + logits_rescored[indices_to_remove] = float('-inf') + probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) + sampled_codes = torch.multinomial(probs.view(B*C, -1), 1).view(B, C) + if use_cfg: + # TODO @rfejgin: why do we need to keep second half of the batch? can probably optimize this + sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] + probs[actual_batch_size:] = probs[:actual_batch_size] + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + + # set confidence to max for unmasked codebooks so that they will remain unmasked + confidences.scatter_(index=topk_indices, dim=1, src=max_confidence*torch.ones_like(topk_indices, dtype=torch.float)) + + # replace entries in sampled_codes with previously unmasked codebooks + sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + # optionally: add noise to confidences here (as in token-critic paper) (not implemented) + + codes = sampled_codes + assert not (codes == self.mask_token_id).any(), f"Codes contain mask tokens after completion of MaskGit sampling" + if use_cfg: + codes = codes[:actual_batch_size] + return codes + + def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0): + # dec_output: (B, E) + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) + all_preds = [] + for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): + _mask = torch.ones( 
local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) + codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, -1, :]) # (B, num_all_tokens_per_codebook) + if use_cfg: + actual_batch_size = codebook_logits.size(0) // 2 + conditional_logits = codebook_logits[:actual_batch_size] + unconditional_logits = codebook_logits[actual_batch_size:] + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + codebook_logits[:actual_batch_size] = cfg_logits + + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if use_cfg: + codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] + all_preds.append(codebook_preds) + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) # (B, 1, 128) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, 128) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, T+1, 128) + + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + if use_cfg: + all_preds = 
all_preds[:actual_batch_size] + + return all_preds + + + def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={}): + # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): + # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep + all_preds = [] + for idx in range(self.phoneme_stacking_factor): + si = idx * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, 
num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def log_val_audio_example( + self, + logits, + target_audio_codes, + audio_codes_lens_target, + context_audio_codes=None, + context_audio_codes_lens=None, + ): + wandb_audio_log = {} + + pred_audio_codes = self.logits_to_audio_codes(logits, audio_codes_lens_target) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target) + + context_audio, context_audio_lens = None, None + if context_audio_codes is not None and context_audio_codes.shape[2] > 3: + # > 3 ensures, it is a valid context audio tensor (and not dummy tensor used in text context) + context_audio, context_audio_lens = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + + for logger in self.loggers: + is_wandb = isinstance(logger, WandbLogger) + is_tb = isinstance(logger, TensorBoardLogger) + if not is_wandb and not is_tb: + raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. 
Only `WandbLogger` and `TensorBoardLogger` are supported.") + + for idx in range(min(3, pred_audio.size(0))): + pred_audio_np = pred_audio[idx].float().detach().cpu().numpy() + target_audio_np = target_audio[idx].float().detach().cpu().numpy() + pred_audio_np = pred_audio_np[: pred_audio_lens[idx]] + target_audio_np = target_audio_np[: target_audio_lens[idx]] + context_audio_np = None + if context_audio is not None: + context_audio_np = context_audio[idx].float().detach().cpu().numpy() + context_audio_np = context_audio_np[: context_audio_lens[idx]] + + if is_wandb: + wandb_audio_log[f"Audio/Example_{idx}"] = list() + if context_audio_np is not None: + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context")) + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction")) + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target")) + + if is_tb: + if context_audio_np is not None: + logger.experiment.add_audio( + f'Example_{idx}/context', + context_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + logger.experiment.add_audio( + f'Example_{idx}/prediction', + pred_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + logger.experiment.add_audio( + f'Example_{idx}/target', + target_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + + return wandb_audio_log + + + def join_embeddings_temporally( + self, + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None # (E,) defaults to zeros + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Merges Multiple Embedding sequences into a single Embedding Sequence. 
+ + Args: + embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding + lengths : Sequence of tensors, each of shape (B,) + pad_embed : (E,) — embedding to use for padding, defaults to zeros + + Returns: + joined : (B, max_sum_len, E) — merged & padded + out_lengths : (B,) — total lengths of each batch element after merging + """ + if len(embeddings) == 0: + raise ValueError("contexts must be non-empty") + + B, _, E = embeddings[0].shape + device = embeddings[0].device + dtype = embeddings[0].dtype + + # 1. compute output sizes + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + out_lengths = len_stack.sum(0) + max_len = int(out_lengths.max()) + + if pad_embed is None: + pad_embed = torch.zeros(E, dtype=dtype, device=device) + + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + + # batch row indices + batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) + + # running offset keeps “write cursor” for each row + offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) + + for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): + Ti = embedding_i.shape[1] + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + + # destination columns: offset + t + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + + # Assign embedding_i to the correct positions in joined + joined[batch_rows.expand_as(mask)[mask], + dest_cols[mask]] = embedding_i[mask] + + # move cursor past this segment + offset += len_i + + return joined, out_lengths + + def prepare_context_tensors(self, batch, dropout_text_input=False): + # Transcript + text = batch['text'] + text_lens = batch['text_lens'] + text_embedded = self.decoder.get_input_embeddings()(text) + if self.use_bpe_char_tokenizer: + text_mask = get_mask_from_lengths(text_lens) + cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) + text_embedded = text_embedded + cas_embedding + + if 
text_embedded.shape[1] < self.streaming_speech_delay + 1: + # If text is too short, pad it with zeros + padding_tensor = torch.zeros(text_embedded.shape[0], self.streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device) + text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) + + if dropout_text_input: + # Make text embedding all zeros + text_embedded = text_embedded * 0.0 + + # Context Audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() + else: + context_audio_codes, context_audio_codes_lens = self.audio_to_codes( + batch['context_audio'], batch['context_audio_lens'], audio_type='context' + ) + + context_audio_codes, context_audio_codes_lens = self.stack_codes(context_audio_codes, context_audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) + + # Context Text + context_text_tokens = batch['context_text_tokens'] + context_text_lens = batch['context_text_tokens_lens'] + context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + + remaining_text_embedded = None + remaining_text_lens = None + if self.text_input_mode == 'full': + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, text_lens], + ) + elif self.text_input_mode == 'streaming': + prompt_text_embedded = text_embedded[:,:self.streaming_speech_delay,:] + prompt_text_lens = torch.ones_like(text_lens) * self.streaming_speech_delay + 
context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + remaining_text_embedded = text_embedded[:,self.streaming_speech_delay:,:] + remaining_text_lens = text_lens - self.streaming_speech_delay + remaining_text_lens = remaining_text_lens.clamp(min=0) + remaining_text_mask = get_mask_from_lengths(remaining_text_lens) + remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + else: + raise ValueError(f"Invalid text input mode: {self.text_input_mode}") + + return { + 'context_embedding': context_embedding, # (B, T_total, E) + 'context_lens': context_lens, # (B,) + 'context_audio_codes': context_audio_codes, # (B, C, T') + 'context_audio_embedded': context_audio_embedded, # (B, T', E) + 'context_audio_codes_lens': context_audio_codes_lens, # (B,) + 'text_embedded': text_embedded, # (B, L, E) + 'text_lens': text_lens, # (B,) + 'context_text_tokens': context_text_tokens, # (B, L) + 'context_text_lens': context_text_lens, # (B,) + 'remaining_text_embedded': remaining_text_embedded, # (B, T, E) + 'remaining_text_lens': remaining_text_lens, # (B,) + } + + def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): + """ + Slices the transformer output to get the predicted embeddings for the target sequence. 
+ Args: + transformer_out: (B, T, E) + context_lens: (B,) - start index of target per batch + target_lens: (B,) - length of target per batch + + Returns: (B, T_max, E) tensor where T_max = max(target_lens) + """ + B, T, E = transformer_out.shape + device = transformer_out.device + + # Compute max target length in batch for padding + max_len = target_lens.max().item() + + # Build index tensor for each batch element + # Shape: (B, max_len) + range_indices = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) + gather_indices = context_lens.unsqueeze(1) + range_indices # (B, max_len) + gather_indices = torch.clamp(gather_indices, max=transformer_out.size(1) - 1) + + # Expand to shape (B, max_len, E) for gather + gather_indices_exp = gather_indices.unsqueeze(2).expand(-1, -1, E) + sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) + return sliced + + + def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + if stacking_factor == 1: + return codes, codes_lens + + contains_bos = codes[0,0,0].item() == bos_id + if contains_bos: + bos_tensor_repeated = torch.full((codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token + B, C, T = codes.shape + s = int(stacking_factor) + + # --- Compute max padding needed --- + pad_t = (-T) % s # pad so that T' is divisible by s + pad_tail = torch.full((B, C, pad_t), eos_id, + dtype=codes.dtype, device=codes.device) + codes = torch.cat([codes, pad_tail], dim=-1) + + # --- Stack time into channel dimension --- + Tp = codes.shape[-1] + T_out = Tp // s + codes = codes.view(B, C, T_out, s) + codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) + + new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') + if contains_bos: + codes = torch.cat([bos_tensor_repeated, codes], dim=2) + new_lens = new_lens + 1 + + 
return codes, new_lens + + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + if stacking_factor == 1: + return stacked_codes, stacked_lens + + B, CxS, T_out = stacked_codes.shape + s = int(stacking_factor) + assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" + + C = CxS // s + # Reshape: split channels back into (C, s) + x = stacked_codes.view(B, C, s, T_out) + # Bring s back into time dimension + x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) + + # Recover original lengths (before padding) + orig_lens = stacked_lens * s + + return x, orig_lens + + def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): + # import ipdb; ipdb.set_trace() + phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens, phoneme_tokens_lens = self.stack_codes( + phoneme_tokens, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1 + ) + # import ipdb; ipdb.set_trace() + phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) + + phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) + phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + + zero_context_tensor = torch.zeros(context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device) + phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( + embeddings=[zero_context_tensor, phoneme_tokens_embedded], + lengths=[context_lens, phoneme_tokens_lens], + ) + return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens + + + def process_batch(self, batch, mode="train"): + dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False + dropout_phoneme_input = ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode 
== 'train' else False + context_tensors = self.prepare_context_tensors(batch, dropout_text_input) + print("text lens", context_tensors['text_lens']) + remaining_text_embedded = context_tensors['remaining_text_embedded'] + context_embedding = context_tensors['context_embedding'] + context_lens = context_tensors['context_lens'] + + dropout_conditional_input = False + if mode == 'train' and self.cfg_unconditional_prob > 0.0: + if torch.rand(1).item() < self.cfg_unconditional_prob: + dropout_conditional_input = True + # Get embedding of a special UNCONDITIONAL_TOKEN + cfg_token_id = self.cfg_unk_token_id # int + cfg_token_embedding = self.decoder.get_input_embeddings()(torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device)) # (B, 1, E) + # Keeping the dummy context same size as the context embedding makes + # inference easier especially with KV caching and using a duplicated batch. + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) + # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. 
+ if self.text_input_mode == 'streaming': + remaining_text_embedded = torch.zeros_like(remaining_text_embedded) + + if 'audio_codes' not in batch: + audio_codes, audio_codes_lens = self.audio_to_codes(batch['audio'], batch['audio_lens']) + else: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() + + audio_codes, audio_codes_lens = self.stack_codes(audio_codes, audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 + audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder + audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # Computing this to be use in the alignment encoder + if remaining_text_embedded is not None: + # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right + padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) + padding_tensor = torch.zeros(remaining_text_embedded.size(0), padding_len, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded + + + context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( + embeddings=[context_embedding, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens_input], + ) + + if self.phoneme_tokenizer is not None: + context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + phoneme_channel_input, 
phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], + batch['phoneme_tokens_lens'], + context_lens_for_phonemes + ) + print("phoneme_tokens_lens", phoneme_tokens_lens) + print("audio_codes_lens", audio_codes_lens_input) + if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: + padding_tensor = torch.zeros(phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], phoneme_channel_input.shape[2], device=phoneme_channel_input.device) + phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) + else: + phoneme_channel_input = phoneme_channel_input[:, :context_plus_audio_embedded.shape[1], :] + + if (not dropout_conditional_input) and (not dropout_phoneme_input): + context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + + transformer_out = self.forward( + inputs_embeds=context_plus_audio_embedded, + attention_mask=get_mask_from_lengths(context_plus_audio_lens), + ) + transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) + + pred_embeddings = self.slice_pred_embeddings( + transformer_hidden_states, + context_lens=context_lens, + target_lens=audio_codes_lens_target, + ) + + logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) + # import ipdb; ipdb.set_trace() + codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) + loss = codebook_loss + + local_transformer_loss = None + local_transformer_logits = None + if self.local_transformer_type != LocalTransformerType.NO_LT: + if self.local_transformer_type == LocalTransformerType.MASKGIT: + # randomly replace some positions with MASK_TOKEN + audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) + local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_masked, 
targets_offset_by_one=True) + #audio_codes_masked = audio_codes_masked[:, 1:, :] + local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask) + else: + # autoregressive + assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_target, targets_offset_by_one=False) + local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, None) + local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) + loss = loss + local_transformer_loss_scale * local_transformer_loss + + phoneme_loss = None + if self.phoneme_tokenizer is not None: + pred_embeddings_phoneme = self.slice_pred_embeddings( + transformer_hidden_states, + context_lens=context_lens_for_phonemes, + target_lens=phoneme_tokens_lens-1, + ) + phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): + # Only compute phoneme loss if not doing unconditional training or text dropout + phoneme_loss, _ = self.compute_phoneme_loss(phoneme_logits, phoneme_tokens[:,:,1:].long(), phoneme_tokens_lens - 1) + print("No Dropout - phoneme loss:", phoneme_loss.item()) + else: + phoneme_loss = torch.tensor(0.0, device=logits.device) + print("Dropout - phoneme loss skipped", phoneme_loss.item()) + + loss = loss + phoneme_loss + + return { + 'loss': loss, + 'codebook_loss': codebook_loss, + 'phoneme_loss': phoneme_loss, + 'local_transformer_loss': local_transformer_loss, + 'local_transformer_logits': local_transformer_logits, # (B, T', num_codebooks * num_tokens_per_codebook) + 'logits': logits, + 'audio_codes_target': audio_codes_target, # (B, C, T') + 'audio_codes_lens_target': 
audio_codes_lens_target, # (B,) + 'context_audio_codes': context_tensors['context_audio_codes'], # (B, C, T') + 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) + } + + + + def training_step(self, batch, batch_idx): + batch_output = self.process_batch(batch) + loss = batch_output['loss'] + codebook_loss = batch_output['codebook_loss'] + self.log('train/codebook_loss', codebook_loss, prog_bar=True, sync_dist=True) + self.log('train/loss', loss, prog_bar=True, sync_dist=True) + + if self.phoneme_tokenizer is not None: + phoneme_loss = batch_output['phoneme_loss'] + self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) + + local_transformer_loss = batch_output['local_transformer_loss'] + if local_transformer_loss is not None: + self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) + + # Log batch info + batch_size, text_token_max_len = batch["text"].shape + text_token_total_num = batch["text_lens"].sum() + batch_info_dict = { + "train/batch_size": batch_size, + "train/text_token_max_len": text_token_max_len, + "train/text_token_total_num_in_batch": text_token_total_num, + "train/text_token_pad_ratio_percent_in_batch": 100 * (1 - text_token_total_num / (batch_size * text_token_max_len)), + } + + if "audio_codes" in batch: + audio_codes_max_len = batch["audio_codes"].shape[-1] + audio_codes_total_num = batch["audio_codes_lens"].sum() + batch_info_dict.update({ + "train/audio_codes_max_len": audio_codes_max_len, + "train/audio_codes_total_num_in_batch": audio_codes_total_num, + "train/audio_codes_pad_ratio_percent_in_batch": 100 * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), + }) + else: + audio_samples_max_len = batch["audio"].shape[-1] + audio_samples_total_num = batch["audio_lens"].sum() + batch_info_dict.update({ + "train/audio_samples_max_len": audio_samples_max_len, + "train/audio_samples_total_num_in_batch": audio_samples_total_num, + 
"train/audio_samples_pad_ratio_percent_in_batch": 100 * (1 - audio_samples_total_num / (batch_size * audio_samples_max_len)), + }) + + self.log_dict(batch_info_dict, on_step=True) + + return loss + + def validation_step(self, batch, batch_idx): + batch_output = self.process_batch(batch, mode="val") + # self.process_batch returns a dict. We currently only log "logits" which come from the parallel prediction + # head. If we use local_transformer, then the local_transformer returns "local_transformer_logits" + loss = batch_output['loss'] + codebook_loss = batch_output['codebook_loss'] + logits = batch_output['logits'] + audio_codes_target = batch_output['audio_codes_target'] + audio_codes_lens_target = batch_output['audio_codes_lens_target'] + context_audio_codes = batch_output['context_audio_codes'] + context_audio_codes_lens = batch_output['context_audio_codes_lens'] + + if batch_idx == 0 and self.global_rank == 0: + # Prepare dictionary for aggregated wandb logging + wandb_log_dict = {} + + # Get audio data for logging + wandb_log_dict.update( + self.log_val_audio_example( + logits, audio_codes_target, audio_codes_lens_target, context_audio_codes, context_audio_codes_lens + ) + ) + + # Perform single wandb log call if wandb is active and there is data + for logger in self.loggers: + if isinstance(logger, WandbLogger) and wandb_log_dict: + logger.experiment.log(wandb_log_dict) + + # infer_output_no_cfg_noLT = self.infer_batch( + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=False, + # maskgit_n_steps=3, + # use_cfg=False, + # cfg_scale=1.0 + # ) + # infer_output_cfg_withLT = self.infer_batch( + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, + # maskgit_n_steps=3, + # use_cfg=True, + # cfg_scale=2.5 + # ) + # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = 
infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] + # pred_audio_cfg_withLT, pred_audio_cfg_withLT_lens = infer_output_cfg_withLT[0], infer_output_cfg_withLT[1] + + # for logger in self.loggers: + # is_wandb = isinstance(logger, WandbLogger) + # is_tb = isinstance(logger, TensorBoardLogger) + # if not is_wandb and not is_tb: + # raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") + # for idx in range(pred_audio_no_cfg_noLT.size(0)): + # pred_audio_no_cfg_noLT_idx = pred_audio_no_cfg_noLT[idx][:pred_audio_no_cfg_noLT_lens[idx]].float().cpu().numpy() + # pred_audio_cfg_withLT_idx = pred_audio_cfg_withLT[idx][:pred_audio_cfg_withLT_lens[idx]].float().cpu().numpy() + # if is_wandb: + # logger.experiment.log({ + # "val/pred_audio_no_cfg_noLT": wandb.Audio(pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, caption="Inference No CFG, No LT"), + # "val/pred_audio_cfg_withLT": wandb.Audio(pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, caption="Inference CFG, With LT"), + # }) + # if is_tb: + # logger.experiment.add_audio( + # "val/pred_audio_no_cfg_noLT", pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, global_step=batch_idx + # ) + # logger.experiment.add_audio( + # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, global_step=batch_idx + # ) + + local_transformer_loss = batch_output['local_transformer_loss'] + val_output = { + 'val_loss': loss, + 'val_codebook_loss': codebook_loss, + 'val_local_transformer_loss': local_transformer_loss, + } + + if self.phoneme_tokenizer is not None: + phoneme_loss = batch_output['phoneme_loss'] + val_output['val_phoneme_loss'] = phoneme_loss + + self.validation_step_outputs.append(val_output) + + return val_output + + def on_validation_epoch_end(self): + collect = lambda key: torch.stack([x[key] for x in self.validation_step_outputs]).mean() + val_loss = collect("val_loss") + 
val_codebook_loss = collect("val_codebook_loss") + + self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) + self.log("val/codebook_loss", val_codebook_loss, prog_bar=True, sync_dist=True) + + if self.local_transformer_type != LocalTransformerType.NO_LT: + val_local_transformer_loss = collect("val_local_transformer_loss") + self.log("val/local_transformer_loss", val_local_transformer_loss, prog_bar=True, sync_dist=True) + + if self.phoneme_tokenizer is not None: + val_phoneme_loss = collect("val_phoneme_loss") + self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) + + self.validation_step_outputs.clear() # free memory + + def get_dataset(self, dataset_cfg, dataset_type): + dataset = instantiate( + dataset_cfg.dataset, + sample_rate=self.sample_rate, + bos_id=None, + eos_id=self.eos_id, + audio_bos_id=self.audio_bos_id, + audio_eos_id=self.audio_eos_id, + context_audio_bos_id=self.context_audio_bos_id, + context_audio_eos_id=self.context_audio_eos_id, + num_audio_codebooks=self.data_num_audio_codebooks, + codec_model_samples_per_frame=self.codec_model_samples_per_frame, + prior_scaling_factor=0.0, + load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, + dataset_type=dataset_type, # train or test used for setting phone prob to 1.0 in test dataset (worker_init_fn) + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + context_duration_min=self.cfg.context_duration_min, + context_duration_max=self.cfg.context_duration_max, + ) + dataset.load_16khz_audio = False + dataset.tokenizer_config = ( + self.cfg.text_tokenizers + ) # This will be used in worker_init_fn for instantiating tokenizer + if self.phoneme_tokenizer is not None: + dataset.phoneme_tokenizer_config = self.cfg.phoneme_tokenizer + + return dataset + + def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> 
torch.utils.data.DataLoader: + # TODO @xueyang: better to distinguish cfg. self.cfg is the model cfg, while cfg here is train_ds cfg. Also + # cfg is a classifier-free guidance. + dataset = MagpieTTSLhotseDataset( + sample_rate=self.sample_rate, + volume_norm=dataset_cfg.volume_norm, + codec_model_samples_per_frame=self.codec_model_samples_per_frame, + audio_bos_id=self.audio_bos_id, + audio_eos_id=self.audio_eos_id, + context_audio_bos_id=self.context_audio_bos_id, + context_audio_eos_id=self.context_audio_eos_id, + num_audio_codebooks=self.data_num_audio_codebooks, + prior_scaling_factor=0.0, + load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, + dataset_type=mode, # train or test used for setting phone prob to 1.0 in test dataset (worker_init_fn) + load_16khz_audio=False, + pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + context_duration_min=self.cfg.context_duration_min, + context_duration_max=self.cfg.context_duration_max, + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, + tokenizer_config=self.cfg.text_tokenizers, + phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None) + ) + + data_loader = get_lhotse_dataloader_from_config( + config=dataset_cfg.dataset, + global_rank=self.global_rank, + world_size=self.world_size, + dataset=dataset, + ) + return data_loader + + def setup_training_data(self, dataset_cfg): + if dataset_cfg.get("use_lhotse", False): + # TODO @xueyang: better to distinguish cfg. self.cfg is the model cfg, while cfg here is train_ds cfg. Also + # cfg is a classifier-free guidance. 
+ self._train_dl = self.get_lhotse_dataloader(dataset_cfg, mode='train') + else: + dataset = self.get_dataset(dataset_cfg, dataset_type='train') + sampler = dataset.get_sampler(dataset_cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) + persistent_workers = True + if dataset_cfg.dataloader_params.num_workers == 0: + persistent_workers = False + # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) + dataset.text_tokenizer = setup_tokenizers( + all_tokenizers_config=self.cfg.text_tokenizers, + mode='train', + ) + if self.cfg.get("phoneme_tokenizer", None) is not None: + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + + self._train_dl = torch.utils.data.DataLoader( + dataset, + collate_fn=dataset.collate_fn, + sampler=sampler, + **dataset_cfg.dataloader_params, + worker_init_fn=worker_init_fn, + persistent_workers=persistent_workers, + ) + + def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: + if dataset_cfg.get("use_lhotse", False): + data_loader = self.get_lhotse_dataloader(dataset_cfg, mode='test') + else: + dataset = self.get_dataset(dataset_cfg, dataset_type='test') + persistent_workers = True + if dataset_cfg.dataloader_params.num_workers == 0: + persistent_workers = False + # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) + dataset.text_tokenizer = setup_tokenizers( + all_tokenizers_config=self.cfg.text_tokenizers, + mode='test' + ) + if self.cfg.get("phoneme_tokenizer", None) is not None: + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + + data_loader = torch.utils.data.DataLoader( + dataset, + collate_fn=dataset.collate_fn, + **dataset_cfg.dataloader_params, + worker_init_fn=worker_init_fn, + persistent_workers=persistent_workers, + ) + return data_loader + + def setup_validation_data(self, cfg): + self._validation_dl = 
self._setup_test_dataloader(cfg) + + def setup_test_data(self, cfg): + self._test_dl = self._setup_test_dataloader(cfg) + + def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + with torch.inference_mode(): + start_time = time.time() + context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) + context_embedding = context_tensors['context_embedding'] # (B, T_total, E) + context_lens = context_tensors['context_lens'] # (B,) + remaining_text_embedded = context_tensors['remaining_text_embedded'] + remaining_text_lens = context_tensors['remaining_text_lens'] + + if self.phoneme_tokenizer is not None: + context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], + batch['phoneme_tokens_lens'], + context_lens_for_phonemes + ) + phoneme_channel_input_pad_tensor = torch.zeros(phoneme_channel_input.size(0), max_decoder_steps, phoneme_channel_input.size(2), device=phoneme_channel_input.device) + phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) + + audio_codes_bos = torch.full( + (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), self.audio_bos_id, device=context_embedding.device + ).long() + audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() + audio_codes_input = audio_codes_bos + + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) + if self.text_input_mode == 'streaming': + remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 + 
remaining_text_pad_tensor = torch.zeros(remaining_text_embedded.size(0), remaining_text_pad_length, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] # :1 corresponds to audio BOS. + + context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( + embeddings=[context_embedding, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens], + ) + min_context_len = context_plus_audio_lens.min().item() + if self.phoneme_tokenizer is not None: + min_context_len = min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 # 1 for audio BOS that we had added. + + actual_batch_size = context_embedding.size(0) + if use_cfg: + dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( + torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) + ) # (B, 1, E) + dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) + + dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( + embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens], + ) + first_inference_input = torch.cat( + [context_plus_audio_embedded, dummy_context_plus_audio_embedded], + dim=0 + )[:,:min_context_len, :] # (2B, T_min, E) + else: + first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + # First forward pass to get the initial hidden state and past key values + transformer_out = self.forward( + inputs_embeds=first_inference_input, + attention_mask=None, + use_cache=True, + past_key_values=None, # No past key values for the first step + ) + + time_to_first_prediction = 
time.time() - start_time + last_hidden = transformer_out.last_hidden_state # (B, T_total, E) + past_kv = transformer_out.past_key_values + + all_predictions = [] + end_indices = {} + + current_text_positions = [] + for item_idx in range(context_embedding.size(0)): + # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) + current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) + current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() + if self.phoneme_tokenizer is not None: + current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 # Make it 0-indexed. + # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + pred_phoneme_token_lists = [ + [] for _ in range(actual_batch_size) + ] + gt_phoneme_token_lists = [ + [] for _ in range(actual_batch_size) + ] + phoneme_stream_ended = torch.zeros(actual_batch_size, device=context_embedding.device).bool() # (B,) Whether phoneme stream has ended for this item. 
+ for idx in range(max_decoder_steps): + # import ipdb; ipdb.set_trace() + current_text_positions += 1 + if self.phoneme_tokenizer is not None: + current_phoneme_positions += 1 + print("current_phoneme_positions", current_phoneme_positions) + if idx % 20 == 0: + print(f"Decoding timestep {idx}") + + all_code_logits_t = self.final_proj(last_hidden[:, -1, :]) # (B, num_codebooks * num_tokens_per_codebook) + + if self.phoneme_tokenizer is not None: + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) # (B, phoneme_stacking_factor * phoneme_vocab_size) + all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + if use_cfg: + conditional_logits = all_code_logits_t[:actual_batch_size] + unconditional_logits = all_code_logits_t[actual_batch_size:] + all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR : + # Autoregressive sampling with local transformer + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + elif self.local_transformer_type == LocalTransformerType.MASKGIT: + audio_codes_next = self.local_transformer_sample_maskgit( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + n_steps=maskgit_n_steps, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + else: + raise ValueError(f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}") + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from logits + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # (B, num_codebooks) + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) # (B, num_codebooks) + + phoneme_channel_input_t = None + + if self.phoneme_tokenizer is not None: + all_codes_next_phoneme = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=temperature, topk=topk) # (B, phoneme_stacking_factor) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) # (B, phoneme_stacking_factor) + pred_phoneme_tokens = all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme # B, phoneme_stacking_factor + phoneme_bos_tensor = torch.full( + (actual_batch_size, self.phoneme_stacking_factor), + self.phoneme_tokenizer.bos_token_id, + device=context_embedding.device + ).long() # (B, phoneme_stacking_factor) + use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() + print("use_bos_phoneme", use_bos_phoneme) + pred_phoneme_tokens = (use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens).long() # (B, phoneme_stacking_factor) + + print("pred_phoneme_tokens", pred_phoneme_tokens) + gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) + print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) + + input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) # (B, phoneme_stacking_factor, E) + + use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) + use_phoneme_input = 
use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) + zero_phoneme_embedding = torch.zeros(actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device).unsqueeze(1) # (B, 1, E) + # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) + phoneme_channel_input_t = use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + print("use_phoneme_input", use_phoneme_input) + for item_idx in range(actual_batch_size): + if use_phoneme_input[item_idx,0,0] > 0: + for phoneme_channel_idx in range(self.phoneme_stacking_factor): + _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() + if _phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + pred_phoneme_token_lists[item_idx].append(_phoneme_token) + + _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() + if _gt_phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) + + if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): + print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) + phoneme_stream_ended[item_idx] = True + all_codes_next_phoneme = all_codes_next_phoneme.unsqueeze(1) + # import ipdb; ipdb.set_trace() + + for item_idx in range(all_codes_next_argmax.size(0)): + if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: + pred_tokens = all_codes_next_argmax[item_idx] + pred_tokens_multinomial = audio_codes_next[item_idx] + if torch.any(pred_tokens == self.audio_eos_id) or torch.any(pred_tokens_multinomial == self.audio_eos_id): + print("End detected for item {} at timestep 
{}".format(item_idx, idx)) + end_indices[item_idx] = idx + + all_predictions.append(audio_codes_next) + + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) + new_emb_unconditional = new_emb * 1 + + if self.text_input_mode == 'streaming': + _bs = context_embedding.size(0) + remaining_text_embedded_current = remaining_text_embedded[torch.arange(_bs), current_text_positions.clamp(min=0) , :].unsqueeze(1) # (B, 1, E) + new_emb = new_emb + remaining_text_embedded_current + + + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) + # import ipdb; ipdb.set_trace() + # True if we have not yet reached the end of the context for this item + # import ipdb; ipdb.set_trace() + if context_incomplete_mask.any(): + # If some contexts are not yet complete. + context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) + context_embedding = context_plus_audio_embedded[:,min_context_len+idx:min_context_len+idx+1,:] # (B, 1, E) + next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb + if phoneme_channel_input_t is not None: + next_input += phoneme_channel_input_t + if use_cfg: + next_input_unconditional = context_incomplete_mask * dummy_context_embedding_unconditional + (1 - context_incomplete_mask) * new_emb_unconditional + next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) + else: + next_input = new_emb + if phoneme_channel_input_t is not None: + next_input += phoneme_channel_input_t + if use_cfg: + next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=past_kv, + ) + last_hidden = transformer_out.last_hidden_state + past_kv = transformer_out.past_key_values + if len(end_indices) == audio_codes_next.size(0): + print("All items finished at timestep {}".format(idx)) + break + 
+ if self.phoneme_tokenizer is not None: + for item_idx in range(actual_batch_size): + print("Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx])) + print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) + predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) + gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) + print("Predicted phoneme text for item {}: {}".format(item_idx, predicted_phoneme_text)) + print("GT phoneme text for item {}: {}".format(item_idx, gt_phoneme_text)) + + tts_generation_time = time.time() - start_time + tts_generation_time_per_frame = tts_generation_time / len(all_predictions) + pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + predicted_lens = [end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0))] # Ensure that the codec is at least of length 4 + predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + + predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) + predicted_codes = self.slice_pred_embeddings( + predicted_codes.permute(0, 2, 1), + context_lens=pred_codes_start_indices, + target_lens=predicted_codes_lens, + ) + predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + end_time = time.time() + total_audio_duration_generated = (predicted_audio_lens.max().item() * predicted_audio_lens.shape[0])/self.sample_rate + rtf = total_audio_duration_generated / (end_time - start_time) + + rtf_metrics = { + 'rtf': rtf, + 'time_to_first_prediction': time_to_first_prediction, + 'tts_generation_time': tts_generation_time, + 'max_frames_generated': len(all_predictions), + 
'tts_generation_time_per_frame': tts_generation_time_per_frame, + 'batch_size': context_embedding.size(0), + } + + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics + + + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + return [] + diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 369380768566..cc3083f30e2c 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -26,15 +26,15 @@ import shutil import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import soundfile as sf import torch from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer -from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset -from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset +from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -73,6 +73,12 @@ class InferenceConfig: maskgit_fixed_schedule: Optional[List[int]] = None maskgit_sampling_type: Optional[str] = None + # Decoder-only inference options + phoneme_input_type: str = "gt" # gt or predicted + phoneme_sampling_method: str = "argmax" # argmax or multinomial + dropout_text_input: bool = False + + is_decoder_only_model: bool = False def build_identifier(self) -> str: """Build a unique identifier string 
for this configuration. @@ -127,8 +133,8 @@ class MagpieInferenceRunner: """ def __init__( - self, - model: MagpieTTSModel, + self,# model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel + model: Union[MagpieTTSModel, MagpieTTSDecoderModel], config: InferenceConfig, ): """Initialize the inference runner. @@ -151,7 +157,8 @@ def _configure_tokenizer(self) -> None: """Configure the tokenizer for inference (phoneme prob = 1.0).""" g2p = None if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr(self.model.tokenizer.tokenizers["english_phoneme"], "g2p"): + g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p elif isinstance(self.model.tokenizer, IPATokenizer): g2p = self.model.tokenizer.g2p @@ -163,13 +170,12 @@ def create_dataset( dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, - ) -> ChunkedTTSInferenceDataset: - """Create a unified dataset for inference. + ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: + """Create an inference dataset. - Always creates ChunkedTTSInferenceDataset which uses language-aware chunking - to automatically handle both short and long texts: - - Short text (below threshold): processed as single chunk - - Long text (above threshold): split into sentence chunks + Standard MagpieTTS uses the chunked inference dataset from `main`. + Decoder-only MagpieTTS uses the regular dataset and its dedicated + `infer_batch()` inference path. Args: dataset_meta: Dataset metadata dictionary with 'manifest_path' and 'audio_dir'. 
@@ -199,11 +205,35 @@ def create_dataset( self._manifest_records = read_manifest(manifest_path) self._audio_base_dir = audio_dir + if self.config.is_decoder_only_model: + logging.info("Creating standard inference dataset for decoder-only model") + dataset = MagpieTTSDataset( + dataset_meta=dataset_meta, + sample_rate=self.model.sample_rate, + min_duration=0.5, + max_duration=20, + codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, + bos_id=getattr(self.model, "bos_id", None), + eos_id=self.model.eos_id, + num_audio_codebooks=self.model.num_audio_codebooks, + prior_scaling_factor=None, + load_cached_codes_if_available=False, + dataset_type='test', + tokenizer_config=None, + load_16khz_audio=False, + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=False, + context_duration_min=context_duration_min, + context_duration_max=context_duration_max, + ) + dataset.text_tokenizer = self.model.tokenizer + else: + logging.info("Creating unified inference dataset") + dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) - # Always use unified dataset (handles both short and long texts automatically) - # Language for chunking thresholds is determined per-sample from manifest - logging.info("Creating unified inference dataset") - dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) + if hasattr(self.model, 'phoneme_tokenizer'): + dataset.phoneme_tokenizer = self.model.phoneme_tokenizer return dataset @@ -217,10 +247,7 @@ def run_inference_on_dataset( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run unified inference on a dataset. 
- - Uses the unified inference path that automatically handles both short texts - (single chunk) and long texts (multiple chunks) through the same code path. + """Run inference on a dataset. Args: dataset: The inference dataset (created by create_dataset()). @@ -248,12 +275,91 @@ def run_inference_on_dataset( raise ValueError("audio_base_dir not provided and not cached from create_dataset()") audio_base_dir = self._audio_base_dir - # Always use unified inference path + if self.config.is_decoder_only_model: + logging.info("Using decoder-only inference path") + return self._run_decoder_only_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) + logging.info("Using unified inference path") return self._run_unified_inference( dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes ) + def _run_decoder_only_inference( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: List[dict], + audio_base_dir: str, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + """Run inference for decoder-only models via `infer_batch()`.""" + os.makedirs(output_dir, exist_ok=True) + self._delete_old_generated_files(output_dir) + + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=self.config.batch_size, + collate_fn=dataset.collate_fn, + num_workers=0, + shuffle=False, + ) + + all_rtf_metrics = [] + generated_audio_paths = [] + codec_file_paths = [] + item_idx = 0 + phoneme_sampling_method = ( + "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method + ) + + for batch_idx, batch in enumerate(dataloader): + logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") + batch_cuda = self._batch_to_cuda(batch) + + predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics = self.model.infer_batch( + 
batch_cuda, + max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, + temperature=self.config.model_inference_parameters.temperature, + topk=self.config.model_inference_parameters.topk, + use_local_transformer_for_inference=self.config.use_local_transformer, + maskgit_n_steps=self.config.maskgit_n_steps, + use_cfg=self.config.use_cfg, + cfg_scale=self.config.model_inference_parameters.cfg_scale, + phoneme_input_type=self.config.phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + dropout_text_input=self.config.dropout_text_input, + ) + + all_rtf_metrics.append(rtf_metrics) + logging.info(f"Output shape: {predicted_audio.size()}") + + for idx in range(predicted_audio.size(0)): + audio_len = predicted_audio_lens[idx].item() + audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] + audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") + sf.write(audio_path, audio_np, self.model.sample_rate) + generated_audio_paths.append(audio_path) + + if save_context_audio and item_idx < len(manifest_records): + self._copy_reference_audio( + manifest_records[item_idx], + audio_base_dir, + output_dir, + item_idx, + ) + + if save_predicted_codes: + code_len = predicted_codes_lens[idx].item() + codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") + torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) + codec_file_paths.append(codes_path) + + item_idx += 1 + + return all_rtf_metrics, generated_audio_paths, codec_file_paths @staticmethod def _batch_to_cuda(batch: dict) -> dict: """Move batch tensors to CUDA device.""" diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 647a8ea66a06..e0cd4c2714be 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -23,12 +23,12 @@ import os from dataclasses import 
dataclass -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, Union import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel from nemo.utils import logging @@ -253,7 +253,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[MagpieTTSModel, str]: +def load_magpie_model(config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -271,7 +271,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma ValueError: If configuration is invalid or sample rates don't match. """ config.validate() - + model_cls = MagpieTTSDecoderModel if is_decoder_only_model else MagpieTTSModel if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) @@ -290,7 +290,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma config.legacy_text_conditioning, ) - model = MagpieTTSModel(cfg=model_cfg) + model = model_cls(cfg=model_cfg) model.use_kv_cache_for_inference = True # Load weights @@ -302,15 +302,15 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: - if config.nemo_file.startswith("nvidia/"): # TODO @xueyang: why ignore `update_config_for_inference`? 
- model = MagpieTTSModel.from_pretrained(config.nemo_file) + if config.nemo_file.startswith("nvidia/"): + model = model_cls.from_pretrained(config.nemo_file) model.use_kv_cache_for_inference = True checkpoint_name = config.nemo_file.split("/")[-1] cfg_sample_rate = None else: # Mode 2: Load from .nemo archive logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = MagpieTTSModel.restore_from(config.nemo_file, return_config=True) + model_cfg = model_cls.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg, cfg_sample_rate = update_config_for_inference( @@ -320,7 +320,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma config.legacy_text_conditioning, ) - model = MagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + model = model_cls.restore_from(config.nemo_file, override_config_path=model_cfg) model.use_kv_cache_for_inference = True checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") From 156f16fc16f26fc7c0c14c51a06c8c618819182e Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 8 Jan 2026 17:52:45 -0500 Subject: [PATCH 02/94] merge wit main again Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 1b60b4b7b6ed..0a346865dcba 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -73,6 +73,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # load codec codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate + self.output_sample_rate = codec_model.output_sample_rate if hasattr(codec_model, "discriminator"): # del codec discriminator to 
free memory @@ -1449,6 +1450,7 @@ def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + # TODO: Make this API same as MagpieTTS model. with torch.inference_mode(): start_time = time.time() context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) @@ -1718,7 +1720,7 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us 'tts_generation_time_per_frame': tts_generation_time_per_frame, 'batch_size': context_embedding.size(0), } - + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics From 6ba36996062fb56567cfa47c7cf89ce2af22155f Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 8 Jan 2026 22:12:37 +0000 Subject: [PATCH 03/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_decoder_only.py | 2 +- examples/tts/magpietts_inference.py | 8 +- .../tts/data/text_to_speech_dataset.py | 8 +- .../tts/data/text_to_speech_dataset_lhotse.py | 12 +- .../tts/models/magpietts_decoder_only.py | 815 +++++++++++------- .../modules/magpietts_inference/inference.py | 8 +- .../tts/modules/magpietts_inference/utils.py | 6 +- 7 files changed, 539 insertions(+), 320 deletions(-) diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/magpietts_decoder_only.py index 44859fee8d64..73bb87de7969 100644 --- a/examples/tts/magpietts_decoder_only.py +++ b/examples/tts/magpietts_decoder_only.py @@ -54,4 +54,4 @@ def main(cfg): if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter \ No newline at end of file + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/tts/magpietts_inference.py 
b/examples/tts/magpietts_inference.py index d8d9e883fd04..3199f58e9970 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -190,7 +190,9 @@ def run_inference_and_evaluation( violin_plot_metrics.remove('utmosv2') # Load model - model, checkpoint_name = load_magpie_model(model_config, is_decoder_only_model=inference_config.is_decoder_only_model) + model, checkpoint_name = load_magpie_model( + model_config, is_decoder_only_model=inference_config.is_decoder_only_model + ) # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -504,7 +506,9 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--ssim_target', type=float, default=None) target_group.add_argument('--is_decoder_only_model', action='store_true') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) - target_group.add_argument('--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial']) + target_group.add_argument( + '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] + ) target_group.add_argument('--dropout_text_input', action='store_true') return parser diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 789636a569e3..254169f621c6 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -437,7 +437,9 @@ def __getitem__(self, index): if self.phoneme_tokenizer is not None: phoneme_tokens = self.phoneme_tokenizer.encode(data.text) - phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) phoneme_tokens_len = len(phoneme_tokens) 
example["phoneme_tokens"] = torch.tensor(phoneme_tokens, dtype=torch.int32) example["phoneme_tokens_len"] = phoneme_tokens_len @@ -727,7 +729,9 @@ def collate_fn(self, batch: List[dict]): if len(phoneme_tokens_list) > 0: batch_phoneme_tokens_len = torch.IntTensor(phoneme_tokens_len_list) phoneme_tokens_max_len = int(batch_phoneme_tokens_len.max().item()) - batch_phoneme_tokens = stack_tensors(phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad) + batch_phoneme_tokens = stack_tensors( + phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad + ) batch_dict['phoneme_tokens'] = batch_phoneme_tokens batch_dict['phoneme_tokens_lens'] = batch_phoneme_tokens_len diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 4bd378151b9a..9bad7a36e44a 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -59,6 +59,7 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer + def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_tokenizer = instantiate(phoneme_tokenizer_config) phoneme_vocab_size = len(phoneme_tokenizer.tokens) @@ -67,6 +68,7 @@ def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 return phoneme_tokenizer + def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
pattern = r"\| Language:\w+ Dataset:[\w\d\W]+ Speaker:[\w\d\W]+ \|" @@ -410,7 +412,9 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: if self.phoneme_tokenizer is not None: phoneme_tokens = self.phoneme_tokenizer.encode(text_str) - phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) phoneme_tokens_len = len(phoneme_tokens) phoneme_token_list.append(torch.tensor(phoneme_tokens, dtype=torch.int32)) phoneme_token_len_list.append(phoneme_tokens_len) @@ -435,9 +439,11 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: } if self.phoneme_tokenizer is not None: - batch_dict["phoneme_tokens"] = collate_vectors(phoneme_token_list, padding_value=self.phoneme_tokenizer.pad) + batch_dict["phoneme_tokens"] = collate_vectors( + phoneme_token_list, padding_value=self.phoneme_tokenizer.pad + ) batch_dict["phoneme_tokens_lens"] = torch.IntTensor(phoneme_token_len_list) - + # audio for SV. if len(audio_list_16khz) > 0: batch_dict["audio_16khz"] = collate_vectors(audio_list_16khz, padding_value=0.0) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 0a346865dcba..f5f5be0522a6 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -11,38 +11,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import random +import time +from functools import partial from typing import List, Sequence, Tuple + import torch import wandb from hydra.utils import instantiate -from functools import partial from lightning.pytorch import Trainer from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig from torch import nn from torch.utils.data import get_worker_info +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers, instantiate_phoneme_tokenizer - +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + MagpieTTSLhotseDataset, + instantiate_phoneme_tokenizer, + setup_tokenizers, +) from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.modules import transformer_2501 - -from nemo.collections.tts.modules.magpietts_modules import CharAwareSubwordEncoder, SpecialAudioToken, LocalTransformerType, cosine_schedule +from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter +from nemo.collections.tts.modules.magpietts_modules import ( + CharAwareSubwordEncoder, + LocalTransformerType, + SpecialAudioToken, + cosine_schedule, +) from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths - from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForCausalLM -) -import time -from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter -import random - def worker_init_fn(worker_id): @@ -51,9 +53,7 @@ def worker_init_fn(worker_id): logging.info(f"Worker {worker_id} initializing...") worker_info = get_worker_info() dataset = worker_info.dataset # Get the dataset instance in this worker - 
tokenizer = setup_tokenizers( - dataset.tokenizer_config, mode=dataset.dataset_type - ) + tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) dataset.text_tokenizer = tokenizer if hasattr(dataset, 'phoneme_tokenizer_config'): dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) @@ -74,7 +74,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate self.output_sample_rate = codec_model.output_sample_rate - if hasattr(codec_model, "discriminator"): # del codec discriminator to free memory del codec_model.discriminator @@ -95,7 +94,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): data_num_audio_codebooks = num_audio_codebooks codebook_size = codec_model.codebook_size codec_converter = None - # The dataloader needs to know the number of codebooks that the context codes were stored in # In the case where there are no context codes saved, and there is no context audio (in the text context path), @@ -105,7 +103,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.num_audio_codebooks = num_audio_codebooks self.codebook_size = codebook_size - self.codec_model_samples_per_frame = codec_model.samples_per_frame # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. 
@@ -136,7 +133,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): all_tokenizers_config=cfg.text_tokenizers, mode='train', ) - + num_tokens_tokenizer = len(self.tokenizer.tokens) num_tokens = num_tokens_tokenizer + 3 # +2 for BOS and EOS self.bos_id = num_tokens - 3 @@ -150,21 +147,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size - self.pad_context_text_to_max_duration = False super().__init__(cfg=cfg, trainer=trainer) # This needs to happen after super().__init__() self._codec_model = codec_model - self._codec_model.freeze() #Lightning does requires_grad = False and self.eval() + self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) - + if self.phoneme_tokenizer is not None: phoneme_embeddings = [] for _ in range(self.phoneme_stacking_factor): @@ -172,13 +168,15 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe": # from transformers.models import qwen3_moe # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) from transformers.models import qwen2_moe - config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32) + + config_qwen2 = 
qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 + ) self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) else: self.transformer_backend_config = AutoConfig.from_pretrained( @@ -192,7 +190,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) - + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -210,10 +208,12 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): d_embed=cfg.embedding_dim, llm_tokenizer_vocab=subword_vocab, subword_padding_idx=self.tokenizer.pad, - special_vocab=special_vocab + special_vocab=special_vocab, ) - self.final_proj = nn.Linear(cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor) + self.final_proj = nn.Linear( + cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) @@ -227,7 +227,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.local_transformer = transformer_2501.Transformer( n_layers=self.cfg.get('local_transformer_n_layers', 2), d_model=local_transformer_hidden_dim, - d_ffn=local_transformer_hidden_dim*4, + d_ffn=local_transformer_hidden_dim * 4, sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), kernel_size=1, is_causal=self.local_transformer_type == LocalTransformerType.AR, @@ -237,10 +237,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): local_transformer_out_projections = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): # Have a 
separate projection layer for each codebook, to distinguish between them - local_transformer_out_projections.append(nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook)) + local_transformer_out_projections.append( + nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook) + ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model @@ -255,7 +256,7 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): if any([substring in key for substring in keys_substrings_to_exclude]): del state_dict[key] return state_dict - + def load_state_dict(self, state_dict, strict=True): """ Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when @@ -276,7 +277,7 @@ def load_state_dict(self, state_dict, strict=True): for key in state_dict.keys(): name_with_dot = f"{name}." 
if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot):]] = state_dict[key] + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) def audio_to_codes(self, audio, audio_len, audio_type='target'): @@ -318,12 +319,14 @@ def codes_to_audio(self, codes, codes_len): if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: # Unstack the audio codes if they are stacked codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - + if codes.size(2) < 5: # If the codes are too short, we need to pad them - codes = torch.cat([codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2).long() + codes = torch.cat( + [codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2 + ).long() codes_len = codes_len + 5 - codes.size(2) - + self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Make a copy to avoid modifying the original tensor if it's used elsewhere @@ -365,7 +368,7 @@ def embed_phoneme_tokens(self, phoneme_tokens): phoneme_embedding = phoneme_embedding + embedding phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) return phoneme_embedding - + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """ Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. @@ -382,41 +385,45 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | Seq. 
Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - + dec_out: (B, T', E) audio_codes_target: (B, C, T') targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) local_transformer_input = [dec_out_all] for codebook_num in range(audio_codes_target.size(1)): - codes = audio_codes_target[:, codebook_num] # (B, T') - codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + codes = audio_codes_target[:, codebook_num] # (B, T') + codes = codes.reshape(-1) # (B*T',) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) local_transformer_input.append(codebook_embedding) - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) + local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) if not targets_offset_by_one: # for autoregressive local transformer the 
target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) + local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) else: # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) all_code_logits = [] for codebook_num in range(audio_codes_target.size(1)): # Using a separate projection layer for each codebook (to distinguish between them) # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num, :]) # (B*T', num_all_tokens_per_codebook) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num, :] + ) # (B*T', num_all_tokens_per_codebook) all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) + all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) all_code_logits = all_code_logits.view( audio_codes_target.size(0), audio_codes_target.size(2), -1 - ) # (B, T', C * num_all_tokens_per_codebook) + ) # (B, T', C * num_all_tokens_per_codebook) return all_code_logits @@ -425,13 +432,13 @@ def maskgit_create_random_mask(self, codes): Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. """ # Codes: (B, C, T) - B,C,T = codes.shape + B, C, T = codes.shape # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
- rand_values = torch.rand(B,T, device=codes.device) - # apply the cosine schedule + rand_values = torch.rand(B, T, device=codes.device) + # apply the cosine schedule frac_masked = cosine_schedule(rand_values) # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T + n_masked = torch.ceil(frac_masked * C).long() # B,T # start from all unmasked mask = torch.zeros_like(codes, dtype=torch.bool) # The code further below is the vectorized version of this: @@ -443,19 +450,19 @@ def maskgit_create_random_mask(self, codes): # # mask the top n_masked positions # mask[b, perm[:n_masked[b,t]], t] = True # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked + # Create random permutations + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) + # Create a mask tensor where each position indicates if it should be masked mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) + mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) # Apply the random permutations to the mask mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - + + return mask # (B, C, T) + def maskgit_apply_random_mask(self, codes): # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. - # Codes: (B, C, T) + # Codes: (B, C, T) mask = self.maskgit_create_random_mask(codes) ## replace some tokens with MASK_TOKEN codes_with_mask = torch.where(mask, self.mask_token_id, codes) @@ -466,7 +473,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N Computes the audio codebook loss. 
Used by (1) The main Magpie-TTS transformer (2) The local transformer, for both autoregressive and MaskGit methods - + logits: (B, T', num_codebooks * num_tokens_per_codebook) audio_codes: (B, C, T') audio_codes_lens: (B,) @@ -483,8 +490,8 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N if not loss_mask.any(): # Without this we were very rarely getting NaNs in the loss logging.warning("No tokens valid were found in compute_loss()!") - return torch.tensor(0.0, device=loss_mask.device), loss_mask - else: + return torch.tensor(0.0, device=loss_mask.device), loss_mask + else: # repeat loss mask for each codebook to simplify code below loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None @@ -523,7 +530,6 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss + phoneme_loss total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): backend_out = self.decoder( @@ -534,7 +540,6 @@ def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_value ) # hidden_states = backend_out.last_hidden_state # (B, T_total, H) return backend_out - def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) @@ -555,7 +560,17 @@ def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): return all_preds - def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0, n_steps=3): + def local_transformer_sample_maskgit( + self, + dec_output, + temperature=0.7, + topk=80, + unfinished_items={}, + finished_items={}, + use_cfg=False, + cfg_scale=1.0, + n_steps=3, + ): """ Sample codes for one timestep from the local transformer 
using MaskGit. """ @@ -565,13 +580,15 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, device = dec_output.device # disable KV cache since our transformer is not causal self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection(dec_output) # (B, 1, D) where D is the dimension of the local transformer + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input_init = self.local_transformer_in_projection( + dec_output + ) # (B, 1, D) where D is the dimension of the local transformer C = self.num_audio_codebooks B = dec_output.size(0) min_confidence = float("-inf") - max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? + max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? confidences = min_confidence * torch.ones(B, C, device=device) # initialize to all masked codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) @@ -580,7 +597,9 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, # get mask fraction frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) # how many codebooks to mask - n_masked = torch.ceil(C * frac_masked).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? + n_masked = torch.ceil( + C * frac_masked + ).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
n_unmasked = C - n_masked # pick top-confidence codebooks up to n_unmasked _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) @@ -588,32 +607,42 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, # replace masks of the top-k confident codebooks with the the codes that were sampled for them unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input + + # build transformer input local_transformer_input = local_transformer_input_init for codebook_num in range(C): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, d_local) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, codebook_num+1, d_local) + next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( + 1 + ) # (B, 1, 768) + next_local_transformer_input = self.local_transformer_in_projection( + next_local_transformer_input + ) # (B, 1, d_local) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) # (B, codebook_num+1, d_local) # run transformer - _mask = torch.ones(B, C+1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, C+1, d_local) - + _mask = torch.ones(B, C + 1, device=device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ + 'output' + ] # (B, C+1, d_local) + # get logits logits = [] for codebook_num in range(C): # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num+1, :]) # (B, 
num_audio_tokens_per_codebook) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num + 1, :] + ) # (B, num_audio_tokens_per_codebook) logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) + logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) # apply CFG if use_cfg: actual_batch_size = logits.size(0) // 2 conditional_logits = logits[:actual_batch_size] unconditional_logits = logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits logits[:actual_batch_size] = cfg_logits # handle unfinished and finished items @@ -622,48 +651,65 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, for item_idx in finished_items: logits[item_idx, :, :] = float('-inf') logits[item_idx, :, self.audio_eos_id] = 0.0 - + # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) + logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) logits_rescored = logits.clone() logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B*C, -1), 1).view(B, C) + probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) + sampled_codes = torch.multinomial(probs.view(B * C, -1), 1).view(B, C) if use_cfg: # TODO @rfejgin: why do we need to keep second half of the batch? 
can probably optimize this sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] probs[actual_batch_size:] = probs[:actual_batch_size] - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) # set confidence to max for unmasked codebooks so that they will remain unmasked - confidences.scatter_(index=topk_indices, dim=1, src=max_confidence*torch.ones_like(topk_indices, dtype=torch.float)) + confidences.scatter_( + index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) + ) # replace entries in sampled_codes with previously unmasked codebooks sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) # optionally: add noise to confidences here (as in token-critic paper) (not implemented) - + codes = sampled_codes - assert not (codes == self.mask_token_id).any(), f"Codes contain mask tokens after completion of MaskGit sampling" + assert not ( + codes == self.mask_token_id + ).any(), f"Codes contain mask tokens after completion of MaskGit sampling" if use_cfg: codes = codes[:actual_batch_size] return codes - def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0): + def local_transformer_sample_autoregressive( + self, + dec_output, + temperature=0.7, + topk=80, + unfinished_items={}, + finished_items={}, + use_cfg=False, + cfg_scale=1.0, + ): # dec_output: (B, E) self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) all_preds = [] for codebook_num in range(self.num_audio_codebooks * 
self.frame_stacking_factor): - _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, -1, :]) # (B, num_all_tokens_per_codebook) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, -1, :] + ) # (B, num_all_tokens_per_codebook) if use_cfg: actual_batch_size = codebook_logits.size(0) // 2 conditional_logits = codebook_logits[:actual_batch_size] unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits for item_idx in unfinished_items: @@ -672,27 +718,38 @@ def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, t codebook_logits[item_idx, :] = float('-inf') codebook_logits[item_idx, self.audio_eos_id] = 0.0 - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) # (B, num_tokens_per_codebook) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored 
/ temperature, dim=-1) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) if use_cfg: codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) # (B, 1, 128) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, 128) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, T+1, 128) + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( + 1 + ) # (B, 1, 128) + next_local_transformer_input = self.local_transformer_in_projection( + next_local_transformer_input + ) # (B, 1, 128) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) # (B, T+1, 128) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) if use_cfg: all_preds = all_preds[:actual_batch_size] return all_preds - - def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={}): + def sample_codes_from_logits( + self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} + ): # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep all_preds = [] for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): @@ -711,7 +768,9 @@ def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = 
float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) @@ -731,7 +790,9 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) @@ -760,7 +821,9 @@ def log_val_audio_example( is_wandb = isinstance(logger, WandbLogger) is_tb = isinstance(logger, TensorBoardLogger) if not is_wandb and not is_tb: - raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") + raise ValueError( + f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported." 
+ ) for idx in range(min(3, pred_audio.size(0))): pred_audio_np = pred_audio[idx].float().detach().cpu().numpy() @@ -775,9 +838,15 @@ def log_val_audio_example( if is_wandb: wandb_audio_log[f"Audio/Example_{idx}"] = list() if context_audio_np is not None: - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context")) - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction")) - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target")) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context") + ) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction") + ) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target") + ) if is_tb: if context_audio_np is not None: @@ -802,12 +871,11 @@ def log_val_audio_example( return wandb_audio_log - def join_embeddings_temporally( self, - embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] - lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` - pad_embed: torch.Tensor | None = None # (E,) defaults to zeros + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros ) -> Tuple[torch.Tensor, torch.Tensor]: """ Merges Multiple Embedding sequences into a single Embedding Sequence. 
@@ -816,7 +884,7 @@ def join_embeddings_temporally( embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding lengths : Sequence of tensors, each of shape (B,) pad_embed : (E,) — embedding to use for padding, defaults to zeros - + Returns: joined : (B, max_sum_len, E) — merged & padded out_lengths : (B,) — total lengths of each batch element after merging @@ -829,14 +897,14 @@ def join_embeddings_temporally( dtype = embeddings[0].dtype # 1. compute output sizes - len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) out_lengths = len_stack.sum(0) - max_len = int(out_lengths.max()) + max_len = int(out_lengths.max()) if pad_embed is None: pad_embed = torch.zeros(E, dtype=dtype, device=device) - joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) # batch row indices batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) @@ -846,15 +914,14 @@ def join_embeddings_temporally( for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): Ti = embedding_i.shape[1] - t_idx = torch.arange(Ti, device=device) # (Ti,) - mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) # destination columns: offset + t - dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) # Assign embedding_i to the correct positions in joined - joined[batch_rows.expand_as(mask)[mask], - dest_cols[mask]] = embedding_i[mask] + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask] # move cursor past this segment offset += len_i @@ -870,10 +937,15 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): text_mask = get_mask_from_lengths(text_lens) cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) text_embedded = 
text_embedded + cas_embedding - + if text_embedded.shape[1] < self.streaming_speech_delay + 1: # If text is too short, pad it with zeros - padding_tensor = torch.zeros(text_embedded.shape[0], self.streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device) + padding_tensor = torch.zeros( + text_embedded.shape[0], + self.streaming_speech_delay + 1 - text_embedded.shape[1], + text_embedded.shape[2], + device=text_embedded.device, + ) text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) if dropout_text_input: @@ -892,15 +964,22 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'], audio_type='context' ) - - context_audio_codes, context_audio_codes_lens = self.stack_codes(context_audio_codes, context_audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + + context_audio_codes, context_audio_codes_lens = self.stack_codes( + context_audio_codes, + context_audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) # Context Text context_text_tokens = batch['context_text_tokens'] context_text_lens = batch['context_text_tokens_lens'] context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) - + remaining_text_embedded = None remaining_text_lens = None if self.text_input_mode == 'full': @@ -909,17 +988,17 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): lengths=[context_audio_codes_lens, context_text_lens, text_lens], ) elif self.text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:,:self.streaming_speech_delay,:] + prompt_text_embedded = text_embedded[:, : self.streaming_speech_delay, :] prompt_text_lens = 
torch.ones_like(text_lens) * self.streaming_speech_delay context_embedding, context_lens = self.join_embeddings_temporally( embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], ) - remaining_text_embedded = text_embedded[:,self.streaming_speech_delay:,:] + remaining_text_embedded = text_embedded[:, self.streaming_speech_delay :, :] remaining_text_lens = text_lens - self.streaming_speech_delay remaining_text_lens = remaining_text_lens.clamp(min=0) remaining_text_mask = get_mask_from_lengths(remaining_text_lens) - remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) else: raise ValueError(f"Invalid text input mode: {self.text_input_mode}") @@ -944,7 +1023,7 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): transformer_out: (B, T, E) context_lens: (B,) - start index of target per batch target_lens: (B,) - length of target per batch - + Returns: (B, T_max, E) tensor where T_max = max(target_lens) """ B, T, E = transformer_out.shape @@ -958,29 +1037,29 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): range_indices = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) gather_indices = context_lens.unsqueeze(1) + range_indices # (B, max_len) gather_indices = torch.clamp(gather_indices, max=transformer_out.size(1) - 1) - + # Expand to shape (B, max_len, E) for gather gather_indices_exp = gather_indices.unsqueeze(2).expand(-1, -1, E) sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) return sliced - def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): if stacking_factor == 1: return codes, codes_lens - - contains_bos = codes[0,0,0].item() == bos_id + + contains_bos = codes[0, 0, 0].item() == bos_id if 
contains_bos: - bos_tensor_repeated = torch.full((codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device) # (B,stacking_factor*C, 1) - codes = codes[:, :, 1:] # Remove the bos token - codes_lens = codes_lens - 1 # Remove the bos token + bos_tensor_repeated = torch.full( + (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device + ) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token B, C, T = codes.shape s = int(stacking_factor) # --- Compute max padding needed --- pad_t = (-T) % s # pad so that T' is divisible by s - pad_tail = torch.full((B, C, pad_t), eos_id, - dtype=codes.dtype, device=codes.device) + pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) codes = torch.cat([codes, pad_tail], dim=-1) # --- Stack time into channel dimension --- @@ -995,11 +1074,11 @@ def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_co new_lens = new_lens + 1 return codes, new_lens - + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): if stacking_factor == 1: return stacked_codes, stacked_lens - + B, CxS, T_out = stacked_codes.shape s = int(stacking_factor) assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" @@ -1017,32 +1096,37 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): # import ipdb; ipdb.set_trace() - phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) phoneme_tokens, phoneme_tokens_lens = self.stack_codes( - phoneme_tokens, - phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1 + phoneme_tokens, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + 
self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, ) # import ipdb; ipdb.set_trace() - phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) + phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) - phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) - zero_context_tensor = torch.zeros(context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device) + zero_context_tensor = torch.zeros( + context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device + ) phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( embeddings=[zero_context_tensor, phoneme_tokens_embedded], lengths=[context_lens, phoneme_tokens_lens], ) return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch(self, batch, mode="train"): dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode == 'train' else False + dropout_phoneme_input = ( + ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) + if mode == 'train' + else False + ) context_tensors = self.prepare_context_tensors(batch, dropout_text_input) print("text lens", context_tensors['text_lens']) remaining_text_embedded = context_tensors['remaining_text_embedded'] @@ -1054,9 +1138,11 @@ def process_batch(self, batch, mode="train"): if torch.rand(1).item() < self.cfg_unconditional_prob: dropout_conditional_input = True # Get embedding of a special UNCONDITIONAL_TOKEN - cfg_token_id = self.cfg_unk_token_id # int - cfg_token_embedding = 
self.decoder.get_input_embeddings()(torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device)) # (B, 1, E) - # Keeping the dummy context same size as the context embedding makes + cfg_token_id = self.cfg_unk_token_id # int + cfg_token_embedding = self.decoder.get_input_embeddings()( + torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device) + ) # (B, 1, E) + # Keeping the dummy context same size as the context embedding makes # inference easier especially with KV caching and using a duplicated batch. context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. @@ -1072,19 +1158,32 @@ def process_batch(self, batch, mode="train"): audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() - - audio_codes, audio_codes_lens = self.stack_codes(audio_codes, audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + + audio_codes, audio_codes_lens = self.stack_codes( + audio_codes, + audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # Computing this to be use in the alignment encoder + audio_codes_input_embedded = self.embed_audio_tokens( + audio_codes_input + ) # (B, T, E) # Computing this to be use in the alignment encoder if remaining_text_embedded is not None: # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right 
padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - padding_tensor = torch.zeros(remaining_text_embedded.size(0), padding_len, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + padding_tensor = torch.zeros( + remaining_text_embedded.size(0), + padding_len, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], @@ -1093,18 +1192,23 @@ def process_batch(self, batch, mode="train"): if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], - batch['phoneme_tokens_lens'], - context_lens_for_phonemes + phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = ( + self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + ) ) print("phoneme_tokens_lens", phoneme_tokens_lens) print("audio_codes_lens", audio_codes_lens_input) if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: - padding_tensor = torch.zeros(phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], phoneme_channel_input.shape[2], device=phoneme_channel_input.device) + padding_tensor = torch.zeros( + phoneme_channel_input.shape[0], + context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], + phoneme_channel_input.shape[2], + device=phoneme_channel_input.device, + ) phoneme_channel_input = 
torch.cat([phoneme_channel_input, padding_tensor], dim=1) else: - phoneme_channel_input = phoneme_channel_input[:, :context_plus_audio_embedded.shape[1], :] + phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] if (not dropout_conditional_input) and (not dropout_phoneme_input): context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input @@ -1114,13 +1218,13 @@ def process_batch(self, batch, mode="train"): attention_mask=get_mask_from_lengths(context_plus_audio_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) - + pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens, target_lens=audio_codes_lens_target, ) - + logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) # import ipdb; ipdb.set_trace() codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) @@ -1132,14 +1236,22 @@ def process_batch(self, batch, mode="train"): if self.local_transformer_type == LocalTransformerType.MASKGIT: # randomly replace some positions with MASK_TOKEN audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) - local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_masked, targets_offset_by_one=True) - #audio_codes_masked = audio_codes_masked[:, 1:, :] - local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask) + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_masked, targets_offset_by_one=True + ) + # audio_codes_masked = audio_codes_masked[:, 1:, :] + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask + ) else: # autoregressive assert self.local_transformer_type == 
LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_target, targets_offset_by_one=False) - local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, None) + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_target, targets_offset_by_one=False + ) + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss @@ -1148,12 +1260,16 @@ def process_batch(self, batch, mode="train"): pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens-1, + target_lens=phoneme_tokens_lens - 1, ) - phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + phoneme_logits = self.phoneme_final_proj( + pred_embeddings_phoneme + ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): # Only compute phoneme loss if not doing unconditional training or text dropout - phoneme_loss, _ = self.compute_phoneme_loss(phoneme_logits, phoneme_tokens[:,:,1:].long(), phoneme_tokens_lens - 1) + phoneme_loss, _ = self.compute_phoneme_loss( + phoneme_logits, phoneme_tokens[:, :, 1:].long(), phoneme_tokens_lens - 1 + ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: phoneme_loss = torch.tensor(0.0, device=logits.device) @@ -1174,8 +1290,6 @@ def process_batch(self, batch, mode="train"): 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) } - - def training_step(self, batch, batch_idx): batch_output = 
self.process_batch(batch) loss = batch_output['loss'] @@ -1186,7 +1300,7 @@ def training_step(self, batch, batch_idx): if self.phoneme_tokenizer is not None: phoneme_loss = batch_output['phoneme_loss'] self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) - + local_transformer_loss = batch_output['local_transformer_loss'] if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) @@ -1198,25 +1312,32 @@ def training_step(self, batch, batch_idx): "train/batch_size": batch_size, "train/text_token_max_len": text_token_max_len, "train/text_token_total_num_in_batch": text_token_total_num, - "train/text_token_pad_ratio_percent_in_batch": 100 * (1 - text_token_total_num / (batch_size * text_token_max_len)), + "train/text_token_pad_ratio_percent_in_batch": 100 + * (1 - text_token_total_num / (batch_size * text_token_max_len)), } if "audio_codes" in batch: audio_codes_max_len = batch["audio_codes"].shape[-1] audio_codes_total_num = batch["audio_codes_lens"].sum() - batch_info_dict.update({ - "train/audio_codes_max_len": audio_codes_max_len, - "train/audio_codes_total_num_in_batch": audio_codes_total_num, - "train/audio_codes_pad_ratio_percent_in_batch": 100 * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), - }) + batch_info_dict.update( + { + "train/audio_codes_max_len": audio_codes_max_len, + "train/audio_codes_total_num_in_batch": audio_codes_total_num, + "train/audio_codes_pad_ratio_percent_in_batch": 100 + * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), + } + ) else: audio_samples_max_len = batch["audio"].shape[-1] audio_samples_total_num = batch["audio_lens"].sum() - batch_info_dict.update({ - "train/audio_samples_max_len": audio_samples_max_len, - "train/audio_samples_total_num_in_batch": audio_samples_total_num, - "train/audio_samples_pad_ratio_percent_in_batch": 100 * (1 - audio_samples_total_num / (batch_size * 
audio_samples_max_len)), - }) + batch_info_dict.update( + { + "train/audio_samples_max_len": audio_samples_max_len, + "train/audio_samples_total_num_in_batch": audio_samples_total_num, + "train/audio_samples_pad_ratio_percent_in_batch": 100 + * (1 - audio_samples_total_num / (batch_size * audio_samples_max_len)), + } + ) self.log_dict(batch_info_dict, on_step=True) @@ -1233,7 +1354,7 @@ def validation_step(self, batch, batch_idx): audio_codes_lens_target = batch_output['audio_codes_lens_target'] context_audio_codes = batch_output['context_audio_codes'] context_audio_codes_lens = batch_output['context_audio_codes_lens'] - + if batch_idx == 0 and self.global_rank == 0: # Prepare dictionary for aggregated wandb logging wandb_log_dict = {} @@ -1249,25 +1370,25 @@ def validation_step(self, batch, batch_idx): for logger in self.loggers: if isinstance(logger, WandbLogger) and wandb_log_dict: logger.experiment.log(wandb_log_dict) - + # infer_output_no_cfg_noLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=False, - # maskgit_n_steps=3, - # use_cfg=False, + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=False, + # maskgit_n_steps=3, + # use_cfg=False, # cfg_scale=1.0 # ) # infer_output_cfg_withLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, # maskgit_n_steps=3, - # use_cfg=True, + # use_cfg=True, # cfg_scale=2.5 # ) # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] @@ -1292,7 +1413,7 @@ def validation_step(self, batch, batch_idx): # ) # logger.experiment.add_audio( # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, 
global_step=batch_idx - # ) + # ) local_transformer_loss = batch_output['local_transformer_loss'] val_output = { @@ -1313,18 +1434,18 @@ def on_validation_epoch_end(self): collect = lambda key: torch.stack([x[key] for x in self.validation_step_outputs]).mean() val_loss = collect("val_loss") val_codebook_loss = collect("val_codebook_loss") - + self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) self.log("val/codebook_loss", val_codebook_loss, prog_bar=True, sync_dist=True) - + if self.local_transformer_type != LocalTransformerType.NO_LT: val_local_transformer_loss = collect("val_local_transformer_loss") self.log("val/local_transformer_loss", val_local_transformer_loss, prog_bar=True, sync_dist=True) - + if self.phoneme_tokenizer is not None: val_phoneme_loss = collect("val_phoneme_loss") self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) - + self.validation_step_outputs.clear() # free memory def get_dataset(self, dataset_cfg, dataset_type): @@ -1354,7 +1475,7 @@ def get_dataset(self, dataset_cfg, dataset_type): ) # This will be used in worker_init_fn for instantiating tokenizer if self.phoneme_tokenizer is not None: dataset.phoneme_tokenizer_config = self.cfg.phoneme_tokenizer - + return dataset def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.DataLoader: @@ -1379,9 +1500,9 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D use_text_conditioning_tokenizer=True, text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, tokenizer_config=self.cfg.text_tokenizers, - phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None) + phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), ) - + data_loader = get_lhotse_dataloader_from_config( config=dataset_cfg.dataset, global_rank=self.global_rank, @@ -1427,10 +1548,7 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: if dataset_cfg.dataloader_params.num_workers == 0: 
persistent_workers = False # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) - dataset.text_tokenizer = setup_tokenizers( - all_tokenizers_config=self.cfg.text_tokenizers, - mode='test' - ) + dataset.text_tokenizer = setup_tokenizers(all_tokenizers_config=self.cfg.text_tokenizers, mode='test') if self.cfg.get("phoneme_tokenizer", None) is not None: dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) @@ -1449,7 +1567,20 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + def infer_batch( + self, + batch, + max_decoder_steps=500, + temperature=0.7, + topk=80, + use_local_transformer_for_inference=False, + maskgit_n_steps=3, + use_cfg=False, + cfg_scale=1.0, + phoneme_input_type='gt', + phoneme_sampling_method='argmax', + dropout_text_input=False, + ): # TODO: Make this API same as MagpieTTS model. 
with torch.inference_mode(): start_time = time.time() @@ -1458,29 +1589,43 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us context_lens = context_tensors['context_lens'] # (B,) remaining_text_embedded = context_tensors['remaining_text_embedded'] remaining_text_lens = context_tensors['remaining_text_lens'] - + if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], - batch['phoneme_tokens_lens'], - context_lens_for_phonemes + phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( + self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + ) + ) + phoneme_channel_input_pad_tensor = torch.zeros( + phoneme_channel_input.size(0), + max_decoder_steps, + phoneme_channel_input.size(2), + device=phoneme_channel_input.device, ) - phoneme_channel_input_pad_tensor = torch.zeros(phoneme_channel_input.size(0), max_decoder_steps, phoneme_channel_input.size(2), device=phoneme_channel_input.device) phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) - + audio_codes_bos = torch.full( - (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), self.audio_bos_id, device=context_embedding.device - ).long() + (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=context_embedding.device, + ).long() audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() audio_codes_input = audio_codes_bos audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) if self.text_input_mode == 'streaming': 
remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 - remaining_text_pad_tensor = torch.zeros(remaining_text_embedded.size(0), remaining_text_pad_length, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_pad_tensor = torch.zeros( + remaining_text_embedded.size(0), + remaining_text_pad_length, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] # :1 corresponds to audio BOS. + audio_codes_input_embedded = ( + audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + ) # :1 corresponds to audio BOS. context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], @@ -1488,23 +1633,28 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us ) min_context_len = context_plus_audio_lens.min().item() if self.phoneme_tokenizer is not None: - min_context_len = min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 # 1 for audio BOS that we had added. + min_context_len = ( + min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 + ) # 1 for audio BOS that we had added. 
actual_batch_size = context_embedding.size(0) if use_cfg: dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) - ) # (B, 1, E) - dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) - + ) # (B, 1, E) + dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( + -1, context_embedding.size(1), -1 + ) # (B, T_total, E) + dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) first_inference_input = torch.cat( - [context_plus_audio_embedded, dummy_context_plus_audio_embedded], - dim=0 - )[:,:min_context_len, :] # (2B, T_min, E) + [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 + )[ + :, :min_context_len, : + ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) # First forward pass to get the initial hidden state and past key values @@ -1521,22 +1671,22 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us all_predictions = [] end_indices = {} - + current_text_positions = [] for item_idx in range(context_embedding.size(0)): # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() if self.phoneme_tokenizer is not None: - current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 # Make it 0-indexed. 
- # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay - pred_phoneme_token_lists = [ - [] for _ in range(actual_batch_size) - ] - gt_phoneme_token_lists = [ - [] for _ in range(actual_batch_size) - ] - phoneme_stream_ended = torch.zeros(actual_batch_size, device=context_embedding.device).bool() # (B,) Whether phoneme stream has ended for this item. + current_phoneme_positions = ( + current_text_positions - current_text_positions.max() - 1 + ) # Make it 0-indexed. + # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] + gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] + phoneme_stream_ended = torch.zeros( + actual_batch_size, device=context_embedding.device + ).bool() # (B,) Whether phoneme stream has ended for this item. for idx in range(max_decoder_steps): # import ipdb; ipdb.set_trace() current_text_positions += 1 @@ -1546,19 +1696,23 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if idx % 20 == 0: print(f"Decoding timestep {idx}") - all_code_logits_t = self.final_proj(last_hidden[:, -1, :]) # (B, num_codebooks * num_tokens_per_codebook) - + all_code_logits_t = self.final_proj( + last_hidden[:, -1, :] + ) # (B, num_codebooks * num_tokens_per_codebook) + if self.phoneme_tokenizer is not None: - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) # (B, phoneme_stacking_factor * phoneme_vocab_size) + all_code_logits_t_phoneme = self.phoneme_final_proj( + last_hidden[:, -1, :] + ) # (B, phoneme_stacking_factor * phoneme_vocab_size) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] if use_cfg: conditional_logits = all_code_logits_t[:actual_batch_size] unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * 
unconditional_logits + all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR : + if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer audio_codes_next = self.local_transformer_sample_autoregressive( dec_output=last_hidden[:, -1, :], @@ -1577,54 +1731,88 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us cfg_scale=cfg_scale, ) else: - raise ValueError(f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}") + raise ValueError( + f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}" + ) # TODO @rfejgin: should we add argmax sampling for EOS here too? all_codes_next_argmax = audio_codes_next else: # Parallel sampling from logits - audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # (B, num_codebooks) - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) # (B, num_codebooks) + audio_codes_next = self.sample_codes_from_logits( + all_code_logits_t, temperature=temperature, topk=topk + ) # (B, num_codebooks) + all_codes_next_argmax = self.sample_codes_from_logits( + all_code_logits_t, temperature=0.01 + ) # (B, num_codebooks) phoneme_channel_input_t = None - + if self.phoneme_tokenizer is not None: - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=temperature, topk=topk) # (B, phoneme_stacking_factor) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) # (B, phoneme_stacking_factor) - pred_phoneme_tokens = all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme # B, phoneme_stacking_factor + 
all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=temperature, topk=topk + ) # (B, phoneme_stacking_factor) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) # (B, phoneme_stacking_factor) + pred_phoneme_tokens = ( + all_codes_next_phoneme_argmax + if phoneme_sampling_method == 'argmax' + else all_codes_next_phoneme + ) # B, phoneme_stacking_factor phoneme_bos_tensor = torch.full( (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=context_embedding.device - ).long() # (B, phoneme_stacking_factor) + self.phoneme_tokenizer.bos_token_id, + device=context_embedding.device, + ).long() # (B, phoneme_stacking_factor) use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() print("use_bos_phoneme", use_bos_phoneme) - pred_phoneme_tokens = (use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens).long() # (B, phoneme_stacking_factor) - + pred_phoneme_tokens = ( + use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens + ).long() # (B, phoneme_stacking_factor) + print("pred_phoneme_tokens", pred_phoneme_tokens) gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) - gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) - - input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) # (B, phoneme_stacking_factor, E) - - use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) + + input_phoneme_tokens_current = ( + gt_phoneme_tokens_current if 
phoneme_input_type == 'gt' else pred_phoneme_tokens + ) + input_phoneme_embedding = self.embed_phoneme_tokens( + input_phoneme_tokens_current.unsqueeze(2) + ) # (B, phoneme_stacking_factor, E) + + use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - zero_phoneme_embedding = torch.zeros(actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device).unsqueeze(1) # (B, 1, E) + zero_phoneme_embedding = torch.zeros( + actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device + ).unsqueeze( + 1 + ) # (B, 1, E) # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) - phoneme_channel_input_t = use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + phoneme_channel_input_t = ( + use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + ) print("use_phoneme_input", use_phoneme_input) for item_idx in range(actual_batch_size): - if use_phoneme_input[item_idx,0,0] > 0: + if use_phoneme_input[item_idx, 0, 0] > 0: for phoneme_channel_idx in range(self.phoneme_stacking_factor): _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if _phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + if _phoneme_token not in [ + self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + ]: pred_phoneme_token_lists[item_idx].append(_phoneme_token) - + _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if _gt_phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + if _gt_phoneme_token not in [ + 
self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + ]: gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) - + if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) phoneme_stream_ended[item_idx] = True @@ -1635,34 +1823,44 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: pred_tokens = all_codes_next_argmax[item_idx] pred_tokens_multinomial = audio_codes_next[item_idx] - if torch.any(pred_tokens == self.audio_eos_id) or torch.any(pred_tokens_multinomial == self.audio_eos_id): + if torch.any(pred_tokens == self.audio_eos_id) or torch.any( + pred_tokens_multinomial == self.audio_eos_id + ): print("End detected for item {} at timestep {}".format(item_idx, idx)) end_indices[item_idx] = idx - + all_predictions.append(audio_codes_next) - + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) new_emb_unconditional = new_emb * 1 - + if self.text_input_mode == 'streaming': _bs = context_embedding.size(0) - remaining_text_embedded_current = remaining_text_embedded[torch.arange(_bs), current_text_positions.clamp(min=0) , :].unsqueeze(1) # (B, 1, E) + remaining_text_embedded_current = remaining_text_embedded[ + torch.arange(_bs), current_text_positions.clamp(min=0), : + ].unsqueeze( + 1 + ) # (B, 1, E) new_emb = new_emb + remaining_text_embedded_current - - - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) + + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) # import ipdb; ipdb.set_trace() # True if we have not yet reached the end of the context for this item # import ipdb; ipdb.set_trace() if context_incomplete_mask.any(): # If some contexts are not yet complete. 
context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - context_embedding = context_plus_audio_embedded[:,min_context_len+idx:min_context_len+idx+1,:] # (B, 1, E) + context_embedding = context_plus_audio_embedded[ + :, min_context_len + idx : min_context_len + idx + 1, : + ] # (B, 1, E) next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb if phoneme_channel_input_t is not None: next_input += phoneme_channel_input_t if use_cfg: - next_input_unconditional = context_incomplete_mask * dummy_context_embedding_unconditional + (1 - context_incomplete_mask) * new_emb_unconditional + next_input_unconditional = ( + context_incomplete_mask * dummy_context_embedding_unconditional + + (1 - context_incomplete_mask) * new_emb_unconditional + ) next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) else: next_input = new_emb @@ -1670,7 +1868,7 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us next_input += phoneme_channel_input_t if use_cfg: next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) - + transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, @@ -1682,10 +1880,12 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if len(end_indices) == audio_codes_next.size(0): print("All items finished at timestep {}".format(idx)) break - + if self.phoneme_tokenizer is not None: for item_idx in range(actual_batch_size): - print("Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx])) + print( + "Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx]) + ) print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) gt_phoneme_text = 
self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) @@ -1694,10 +1894,12 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us tts_generation_time = time.time() - start_time tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) - predicted_lens = [end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0))] # Ensure that the codec is atleast of length 4 + pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + predicted_lens = [ + end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0)) + ] # Ensure that the codec is atleast of length 4 predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) predicted_codes = self.slice_pred_embeddings( @@ -1707,9 +1909,11 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) - + end_time = time.time() - total_audio_duration_generated = (predicted_audio_lens.max().item() * predicted_audio_lens.shape[0])/self.sample_rate + total_audio_duration_generated = ( + predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] + ) / self.sample_rate rtf = total_audio_duration_generated / (end_time - start_time) rtf_metrics = { @@ -1723,9 +1927,6 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics - - @classmethod def 
list_available_models(cls) -> List[PretrainedModelInfo]: return [] - diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index cc3083f30e2c..19e5793a892b 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -34,7 +34,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel +from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -133,7 +133,7 @@ class MagpieInferenceRunner: """ def __init__( - self,# model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel + self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel model: Union[MagpieTTSModel, MagpieTTSDecoderModel], config: InferenceConfig, ): @@ -157,7 +157,9 @@ def _configure_tokenizer(self) -> None: """Configure the tokenizer for inference (phoneme prob = 1.0).""" g2p = None if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr(self.model.tokenizer.tokenizers["english_phoneme"], "g2p"): + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( + self.model.tokenizer.tokenizers["english_phoneme"], "g2p" + ): g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p elif isinstance(self.model.tokenizer, IPATokenizer): g2p = self.model.tokenizer.g2p diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py 
b/nemo/collections/tts/modules/magpietts_inference/utils.py index e0cd4c2714be..cce2855dd82b 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel +from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel from nemo.utils import logging @@ -253,7 +253,9 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model(config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: +def load_magpie_model( + config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False +) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. 
Supports two loading modes: From 94fcf032f98dac042c7d37add69c66272f3cdfd2 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 8 Jan 2026 22:57:12 +0000 Subject: [PATCH 04/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index f5f5be0522a6..7cecfce31573 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -74,6 +74,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate self.output_sample_rate = codec_model.output_sample_rate + if hasattr(codec_model, "discriminator"): # del codec discriminator to free memory del codec_model.discriminator @@ -1924,7 +1925,7 @@ def infer_batch( 'tts_generation_time_per_frame': tts_generation_time_per_frame, 'batch_size': context_embedding.size(0), } - + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics @classmethod From ae8f800c2d69196b4886316ed06b9e648e47130b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 9 Jan 2026 16:43:13 -0500 Subject: [PATCH 05/94] handling changes in dataloader Signed-off-by: Paarth Neekhara --- .../tts/models/magpietts_decoder_only.py | 164 +++++++++++------- 1 file changed, 99 insertions(+), 65 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 7cecfce31573..aa3588426281 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -281,69 +281,82 @@ def load_state_dict(self, state_dict, strict=True): 
new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) - def audio_to_codes(self, audio, audio_len, audio_type='target'): - # audio: (B, T) - # audio_len: (B,) - if audio_type == 'target': - audio_eos_id = self.audio_eos_id - audio_bos_id = self.audio_bos_id - elif audio_type == 'context': - audio_eos_id = self.context_audio_eos_id - audio_bos_id = self.context_audio_bos_id - else: - raise ValueError(f"Received audio_type of {audio_type}. Must be `target` or `context`") + def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) + codes_len = codes_len + num_eos_tokens + # Insert EOS token at new final token entry + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] - 1] = eos_id + + return codes, codes_len + + def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) + codes_len = codes_len + num_bos_tokens + codes, codes_len = self.add_eos_token( + codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens + ) + return codes, codes_len + + def remove_bos_token(self, codes, codes_len, num_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = codes[:, :, num_tokens:] + codes_len = codes_len - num_tokens + return codes, codes_len + def remove_embedded_bos_token(self, embedded, embedded_len): + # codes: (B, T', C) + # codes_len: (B,) + embedded = embedded[:, 1:, :] + embedded_len = embedded_len - 1 + return embedded, embedded_len + + def remove_eos_token(self, codes, codes_len): + # codes: (B, C, T') + # codes_len: (B,) + codes_len = codes_len - 1 + codes = codes[:, :, :-1] + mask = get_mask_from_lengths(lengths=codes_len) + codes = codes * mask.unsqueeze(1) + return codes, codes_len + 
+ def remove_embedded_eos_token(self, embedded, embedded_len): + # embedded: (B, T', D) + # embedded_len: (B,) + embedded_len = embedded_len - 1 + embedded = embedded[:, :-1, :] + mask = get_mask_from_lengths(lengths=embedded_len) + embedded = embedded * mask.unsqueeze(2) + return embedded, embedded_len + + def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): + codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) + codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) + return codes, codes_len + + def audio_to_codes(self, audio, audio_len, sample_rate=None): self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len) - if self._codec_converter is not None: - codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=codes_len) - # Add a timestep to begining and end of codes tensor - bos_tensor = torch.full( - (codes.size(0), codes.size(1), 1), audio_bos_id, dtype=codes.dtype, device=codes.device - ) - pad_tensor = torch.full( - (codes.size(0), codes.size(1), 1), 0, dtype=codes.dtype, device=codes.device - ) # 0 is the padding token in the audio codebook - codes = torch.cat([bos_tensor, codes, pad_tensor], dim=-1) - # codes: (B, C, T') - # codes_len: (B,) - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] + 1] = audio_eos_id - codes_len = codes_len + 2 - - return codes.long(), codes_len.long() + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) + return codes, codes_len def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') # codes_len: (B,) - if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: - # Unstack the audio codes if they are stacked - codes, codes_len = self.unstack_codes(codes, 
codes_len, self.frame_stacking_factor) - - if codes.size(2) < 5: - # If the codes are too short, we need to pad them - codes = torch.cat( - [codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2 - ).long() - codes_len = codes_len + 5 - codes.size(2) - self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Make a copy to avoid modifying the original tensor if it's used elsewhere - codes_copy = codes.clone() - # Replace eos and bos tokens with padding in the copied tensor - codes_copy[codes == self.audio_bos_id] = 0 # zero is the padding token - codes_copy[codes == self.audio_eos_id] = 0 # Pass the modified integer token IDs if self._codec_converter is not None: - codes_copy = self._codec_converter.convert_new_to_original( - audio_tokens=codes_copy, audio_lens=codes_len - ) - audio, audio_len = self._codec_model.decode(tokens=codes_copy, tokens_len=codes_len) + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) # audio_len: (B,) - return audio, audio_len + return audio, audio_len, codes def embed_audio_tokens(self, audio_tokens): # audio_tokens: (B, C, T') @@ -502,7 +515,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N codebook_logits = logits[:, :, si:ei] # (B, T', num_tokens_per_codebook) codebook_targets = audio_codes[:, codebook] # (B, T') codebook_loss = self.cross_entropy_loss( - codebook_logits.permute(0, 2, 1), codebook_targets # (B, num_tokens_per_codebook, T') + codebook_logits.permute(0, 2, 1), codebook_targets.long() # (B, num_tokens_per_codebook, T') ) # (B, T') codebook_loss = codebook_loss * loss_mask[:, codebook, :] codebook_loss = codebook_loss.sum() / loss_mask[:, codebook, :].sum() @@ -810,12 +823,24 @@ def log_val_audio_example( wandb_audio_log = {} pred_audio_codes 
= self.logits_to_audio_codes(logits, audio_codes_lens_target) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target) + pred_audio_codes, _ = self.remove_eos_token( + codes=pred_audio_codes, + codes_len=audio_codes_lens_target, + ) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target-1) + target_audio_codes, _ = self.remove_eos_token( + codes=target_audio_codes, + codes_len=audio_codes_lens_target, + ) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target-1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: # > 3 ensures, it is a valid context audio tensor (and not dummy tensor used in text context) + context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + ) context_audio, context_audio_lens = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) for logger in self.loggers: @@ -963,8 +988,16 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): ).long() else: context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], audio_type='context' + batch['context_audio'], batch['context_audio_lens'] ) + + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + bos_id=self.context_audio_bos_id, + eos_id=self.context_audio_eos_id, + ) + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, @@ -1129,7 +1162,7 @@ def process_batch(self, batch, mode="train"): else False ) context_tensors = self.prepare_context_tensors(batch, dropout_text_input) - print("text lens", 
context_tensors['text_lens']) + # print("text lens", context_tensors['text_lens']) remaining_text_embedded = context_tensors['remaining_text_embedded'] context_embedding = context_tensors['context_embedding'] context_lens = context_tensors['context_lens'] @@ -1160,6 +1193,14 @@ def process_batch(self, batch, mode="train"): audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() + + audio_codes, audio_codes_lens = self.add_special_tokens( + codes=audio_codes, + codes_len=audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + audio_codes, audio_codes_lens = self.stack_codes( audio_codes, audio_codes_lens, @@ -1198,8 +1239,8 @@ def process_batch(self, batch, mode="train"): batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes ) ) - print("phoneme_tokens_lens", phoneme_tokens_lens) - print("audio_codes_lens", audio_codes_lens_input) + # print("phoneme_tokens_lens", phoneme_tokens_lens) + # print("audio_codes_lens", audio_codes_lens_input) if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: padding_tensor = torch.zeros( phoneme_channel_input.shape[0], @@ -1455,10 +1496,6 @@ def get_dataset(self, dataset_cfg, dataset_type): sample_rate=self.sample_rate, bos_id=None, eos_id=self.eos_id, - audio_bos_id=self.audio_bos_id, - audio_eos_id=self.audio_eos_id, - context_audio_bos_id=self.context_audio_bos_id, - context_audio_eos_id=self.context_audio_eos_id, num_audio_codebooks=self.data_num_audio_codebooks, codec_model_samples_per_frame=self.codec_model_samples_per_frame, prior_scaling_factor=0.0, @@ -1486,10 +1523,6 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D sample_rate=self.sample_rate, volume_norm=dataset_cfg.volume_norm, codec_model_samples_per_frame=self.codec_model_samples_per_frame, - audio_bos_id=self.audio_bos_id, - audio_eos_id=self.audio_eos_id, - context_audio_bos_id=self.context_audio_bos_id, - context_audio_eos_id=self.context_audio_eos_id, 
num_audio_codebooks=self.data_num_audio_codebooks, prior_scaling_factor=0.0, load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, @@ -1909,6 +1942,7 @@ def infer_batch( target_lens=predicted_codes_lens, ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) end_time = time.time() From c2ee2490864df2597c9d3dfb53b83630cfb38cb2 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 03:06:08 -0500 Subject: [PATCH 06/94] hack to avoid HF error Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/audio_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 2cdd5f0f8c9c..86097e134849 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -183,7 +183,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # load pretrained model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False + "/gitrepos/checkpoints/pytorch_model.bin", strict=False ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() From 88a7576f534d37e532930b7bfa4b92877f38da7e Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Fri, 9 Jan 2026 21:44:13 +0000 Subject: [PATCH 07/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git 
a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index aa3588426281..4bec12088047 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -338,7 +338,7 @@ def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) return codes, codes_len - + def audio_to_codes(self, audio, audio_len, sample_rate=None): self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): @@ -827,12 +827,12 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target-1) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) target_audio_codes, _ = self.remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target-1) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: @@ -990,7 +990,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) - + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, @@ -998,7 +998,6 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): eos_id=self.context_audio_eos_id, ) - context_audio_codes, 
context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, @@ -1193,7 +1192,6 @@ def process_batch(self, batch, mode="train"): audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() - audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, From 76ce3d1545cff18507639f6d8e434331675e5458 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 22:21:05 -0500 Subject: [PATCH 08/94] remove discriminatory temporarily Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/audio_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 86097e134849..b91f32582ad4 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -110,7 +110,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.audio_decoder = instantiate(cfg.audio_decoder) # Discriminator setup - self.discriminator = instantiate(cfg.discriminator) + # self.discriminator = instantiate(cfg.discriminator) # Mel loss setup loss_resolutions = cfg.loss_resolutions From 6f3987ce82d7c969b1a79c4f37f09a2209c2792b Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sat, 10 Jan 2026 08:07:24 +0000 Subject: [PATCH 09/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/audio_codec.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index b91f32582ad4..6ec1f8eb60e7 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -182,9 +182,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # 
self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - self.speaker_encoder.load_checkpoint( - "/gitrepos/checkpoints/pytorch_model.bin", strict=False - ) + self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") From aefe97f06147e1e76cdfb310bcfaedeb4f257761 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 22:42:48 -0500 Subject: [PATCH 10/94] fix errors Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 4bec12088047..b08f355d8e29 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -827,12 +827,12 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) target_audio_codes, _ = self.remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) + target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: @@ -841,7 +841,7 @@ def log_val_audio_example( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio, context_audio_lens = 
self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + context_audio, context_audio_lens, _ = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) for logger in self.loggers: is_wandb = isinstance(logger, WandbLogger) @@ -1941,7 +1941,7 @@ def infer_batch( ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) - predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) + predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) end_time = time.time() total_audio_duration_generated = ( From 9d52822d9fbbad507fc981a631278383f2e5ab16 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 23:47:24 -0500 Subject: [PATCH 11/94] bug fix Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index b08f355d8e29..d7d13d8e2310 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -349,6 +349,9 @@ def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') # codes_len: (B,) self._codec_model.eval() + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + # Unstack the audio codes if they are stacked + codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Pass the modified integer token IDs if self._codec_converter is not None: From 90a6c541a187d7753c6ade72c8a8e07dc31c8bfb Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 11 Jan 2026 03:13:31 -0500 Subject: [PATCH 12/94] add 
moe Signed-off-by: Paarth Neekhara --- .../tts/models/magpietts_decoder_only.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index d7d13d8e2310..c053a92309a2 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -169,16 +169,23 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe": - # from transformers.models import qwen3_moe - # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) - # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - from transformers.models import qwen2_moe - - config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 - ) - self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) + if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": + from transformers.models import qwen3_moe + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": + from transformers.models import qwen3_moe + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": + from transformers.models import qwen3_moe + 
config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + # from transformers.models import qwen2_moe + # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( + # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 + # ) + # self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) else: self.transformer_backend_config = AutoConfig.from_pretrained( cfg.transformer_hf_backend, @@ -352,6 +359,7 @@ def codes_to_audio(self, codes, codes_len): if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: # Unstack the audio codes if they are stacked codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) + with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Pass the modified integer token IDs if self._codec_converter is not None: From 324b8038f103ba9839489dd8ef6d1fcb9cd3361c Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sun, 11 Jan 2026 08:14:21 +0000 Subject: [PATCH 13/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- .../tts/models/magpietts_decoder_only.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index c053a92309a2..4a16f01d5b12 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -171,15 +171,24 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64) + + 
config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64) + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64) + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) # from transformers.models import qwen2_moe # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( From 1c4a568d12833b7e5e75526dcd60b0cacd59bd1f Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 11 Jan 2026 03:20:58 -0500 Subject: [PATCH 14/94] 20 layer moe Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 4a16f01d5b12..b0b4e149bcf1 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -190,6 +190,13 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 ) self.decoder 
= qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_20layer": + from transformers.models import qwen3_moe + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=20, num_experts=64 + ) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) # from transformers.models import qwen2_moe # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 From a19012af4b283e98f56ba60f337866f54796ca63 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 22 Jan 2026 14:32:58 -0500 Subject: [PATCH 15/94] some refactoring and clean up Signed-off-by: Paarth Neekhara --- ..._decoder_only.yaml => easy_magpietts.yaml} | 0 ...lhotse.yaml => easy_magpietts_lhotse.yaml} | 0 ...etts_decoder_only.py => easy_magpietts.py} | 6 +- examples/tts/evalset_config.json | 36 ++- examples/tts/magpietts_inference.py | 10 + nemo/collections/tts/models/__init__.py | 4 +- nemo/collections/tts/models/audio_codec.py | 9 +- ...etts_decoder_only.py => easy_magpietts.py} | 245 +----------------- .../modules/magpietts_inference/inference.py | 4 +- .../tts/modules/magpietts_inference/utils.py | 6 +- 10 files changed, 71 insertions(+), 249 deletions(-) rename examples/tts/conf/magpietts/{magpietts_decoder_only.yaml => easy_magpietts.yaml} (100%) rename examples/tts/conf/magpietts/{magpietts_decoder_only_lhotse.yaml => easy_magpietts_lhotse.yaml} (100%) rename examples/tts/{magpietts_decoder_only.py => easy_magpietts.py} (91%) rename nemo/collections/tts/models/{magpietts_decoder_only.py => easy_magpietts.py} (87%) diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml similarity index 100% rename from examples/tts/conf/magpietts/magpietts_decoder_only.yaml rename to examples/tts/conf/magpietts/easy_magpietts.yaml diff --git 
a/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml similarity index 100% rename from examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml rename to examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/easy_magpietts.py similarity index 91% rename from examples/tts/magpietts_decoder_only.py rename to examples/tts/easy_magpietts.py index 73bb87de7969..4195060b87ef 100644 --- a/examples/tts/magpietts_decoder_only.py +++ b/examples/tts/easy_magpietts.py @@ -16,13 +16,13 @@ import torch.multiprocessing as mp from omegaconf import OmegaConf -from nemo.collections.tts.models import MagpieTTSDecoderModel +from nemo.collections.tts.models import EasyMagpieTTSModel from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -@hydra_runner(config_path="conf/magpietts", config_name="magpietts_decoderonly_en") +@hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) @@ -42,7 +42,7 @@ def main(cfg): trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) exp_manager(trainer, cfg.get("exp_manager", None)) - model = MagpieTTSDecoderModel(cfg=cfg.model, trainer=trainer) + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) model.maybe_init_from_pretrained_checkpoint(cfg=cfg) if cfg.get('mode', 'train') == 'train': diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 4ff4d12ad9eb..029f818ef53b 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -14,10 +14,44 @@ "audio_dir": "/", "feature_dir": null }, + "riva_multibpe": { + "manifest_path": "/Data/evaluation_manifests/riva_hard_multi_bpe.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": 
"/Data/RIVA-TTS" + }, "riva_hard_digits": { "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_letters": { + "manifest_path": "/Data/evaluation_manifests/hard-letters-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_money": { + "manifest_path": "/Data/evaluation_manifests/hard-money-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_short": { + "manifest_path": "/Data/evaluation_manifests/hard-short-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "vctk": { + "manifest_path": "/Data/evaluation_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", + "audio_dir": "/Data/VCTK-Corpus-0.92", + "feature_dir": "/Data/VCTK-Corpus-0.92" + }, + "libritts_seen": { + "manifest_path": "/Data/evaluation_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", + "audio_dir": "/Data/LibriTTS", + "feature_dir": "/Data/LibriTTS" + }, + "libritts_test_clean": { + "manifest_path": "/Data/evaluation_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", + "audio_dir": "/Data/LibriTTS", + "feature_dir": "/Data/LibriTTS" } } - diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 3199f58e9970..1e7753798db4 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -193,6 +193,8 @@ def run_inference_and_evaluation( model, checkpoint_name = load_magpie_model( model_config, is_decoder_only_model=inference_config.is_decoder_only_model ) + # change model to fp32 for inference + model = model.float() # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -551,6 +553,14 @@ def 
main(argv=None): else: model_inference_parameters[field_name] = arg_from_cmdline + if "max_decoder_steps" not in model_inference_parameters: + if args.longform_mode in {'always', 'auto'}: + model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps + elif args.is_decoder_only_model: + model_inference_parameters["max_decoder_steps"] = 220 + else: + model_inference_parameters["max_decoder_steps"] = 440 + inference_config = InferenceConfig( model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), batch_size=args.batch_size, diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 6e781bed19ef..d9f406a3ba3d 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -18,7 +18,7 @@ from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel -from nemo.collections.tts.models.magpietts_decoder_only import MagpieTTSDecoderModel +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, @@ -35,7 +35,7 @@ "HifiGanModel", "InferBatchOutput", "MagpieTTSModel", - "MagpieTTSDecoderModel", + "EasyMagpieTTSModel", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 6ec1f8eb60e7..d5c1afb3a5bf 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -182,7 +182,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # 
self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) + import os + # TODO: revert this + if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): + self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) + else: + self.speaker_encoder.load_checkpoint( + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/easy_magpietts.py similarity index 87% rename from nemo/collections/tts/models/magpietts_decoder_only.py rename to nemo/collections/tts/models/easy_magpietts.py index b0b4e149bcf1..bab703c242ad 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -59,7 +59,7 @@ def worker_init_fn(worker_id): dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) -class MagpieTTSDecoderModel(ModelPT): +class EasyMagpieTTSModel(ModelPT): """ Magpie-TTS Model Decoder Only Model audio/text @@ -107,7 +107,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.codec_model_samples_per_frame = codec_model.samples_per_frame # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. - num_audio_tokens = codec_model.codebook_size # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. 
get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) @@ -468,47 +467,6 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - def maskgit_create_random_mask(self, codes): - """ - Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. - """ - # Codes: (B, C, T) - B, C, T = codes.shape - # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? - rand_values = torch.rand(B, T, device=codes.device) - # apply the cosine schedule - frac_masked = cosine_schedule(rand_values) - # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T - # start from all unmasked - mask = torch.zeros_like(codes, dtype=torch.bool) - # The code further below is the vectorized version of this: - # for b in range(B): - # for t in range(T): - # if n_masked[b,t] > 0: - # # get a random permutation of the codebook indices - # perm = torch.randperm(C) - # # mask the top n_masked positions - # mask[b, perm[:n_masked[b,t]], t] = True - # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) - # Apply the random permutations to the mask - mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - - def maskgit_apply_random_mask(self, codes): - # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. 
- # Codes: (B, C, T) - mask = self.maskgit_create_random_mask(codes) - ## replace some tokens with MASK_TOKEN - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): """ Computes the audio codebook loss. Used by @@ -601,128 +559,6 @@ def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): return all_preds - def local_transformer_sample_maskgit( - self, - dec_output, - temperature=0.7, - topk=80, - unfinished_items={}, - finished_items={}, - use_cfg=False, - cfg_scale=1.0, - n_steps=3, - ): - """ - Sample codes for one timestep from the local transformer using MaskGit. - """ - if self.frame_stacking_factor > 1: - raise NotImplementedError("MaskGit sampling is not implemented for frame stacking factor > 1") - # dec_output: (B, E) - device = dec_output.device - # disable KV cache since our transformer is not causal - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection( - dec_output - ) # (B, 1, D) where D is the dimension of the local transformer - C = self.num_audio_codebooks - B = dec_output.size(0) - - min_confidence = float("-inf") - max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? - confidences = min_confidence * torch.ones(B, C, device=device) - # initialize to all masked - codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) - sampled_codes = codes.clone() - for step in range(n_steps): - # get mask fraction - frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) - # how many codebooks to mask - n_masked = torch.ceil( - C * frac_masked - ).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
- n_unmasked = C - n_masked - # pick top-confidence codebooks up to n_unmasked - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - - # replace masks of the top-k confident codebooks with the the codes that were sampled for them - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input - local_transformer_input = local_transformer_input_init - for codebook_num in range(C): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( - 1 - ) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, d_local) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, codebook_num+1, d_local) - - # run transformer - _mask = torch.ones(B, C + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ - 'output' - ] # (B, C+1, d_local) - - # get logits - logits = [] - for codebook_num in range(C): - # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) # (B, num_audio_tokens_per_codebook) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) - - # apply CFG - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - # handle unfinished and finished items - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, 
:, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) - logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B * C, -1), 1).view(B, C) - if use_cfg: - # TODO @rfejgin: why do we need to keep second half of the batch? can probably optimize this - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - - # set confidence to max for unmasked codebooks so that they will remain unmasked - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - - # replace entries in sampled_codes with previously unmasked codebooks - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - # optionally: add noise to confidences here (as in token-critic paper) (not implemented) - - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), f"Codes contain mask tokens after completion of MaskGit sampling" - if use_cfg: - codes = codes[:actual_batch_size] - return codes - def local_transformer_sample_autoregressive( self, dec_output, @@ -1300,25 +1136,13 @@ def process_batch(self, batch, mode="train"): local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: - if self.local_transformer_type == LocalTransformerType.MASKGIT: - # randomly replace some positions with MASK_TOKEN - audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) - 
local_transformer_logits = self.compute_local_transformer_logits( - pred_embeddings, audio_codes_masked, targets_offset_by_one=True - ) - # audio_codes_masked = audio_codes_masked[:, 1:, :] - local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask - ) - else: - # autoregressive - assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( - pred_embeddings, audio_codes_target, targets_offset_by_one=False - ) - local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, None - ) + assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_target, targets_offset_by_one=False + ) + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss @@ -1438,50 +1262,6 @@ def validation_step(self, batch, batch_idx): if isinstance(logger, WandbLogger) and wandb_log_dict: logger.experiment.log(wandb_log_dict) - # infer_output_no_cfg_noLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=False, - # maskgit_n_steps=3, - # use_cfg=False, - # cfg_scale=1.0 - # ) - # infer_output_cfg_withLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, - # maskgit_n_steps=3, - # use_cfg=True, - # cfg_scale=2.5 - # ) - # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = 
infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] - # pred_audio_cfg_withLT, pred_audio_cfg_withLT_lens = infer_output_cfg_withLT[0], infer_output_cfg_withLT[1] - - # for logger in self.loggers: - # is_wandb = isinstance(logger, WandbLogger) - # is_tb = isinstance(logger, TensorBoardLogger) - # if not is_wandb and not is_tb: - # raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") - # for idx in range(pred_audio_no_cfg_noLT.size(0)): - # pred_audio_no_cfg_noLT_idx = pred_audio_no_cfg_noLT[idx][:pred_audio_no_cfg_noLT_lens[idx]].float().cpu().numpy() - # pred_audio_cfg_withLT_idx = pred_audio_cfg_withLT[idx][:pred_audio_cfg_withLT_lens[idx]].float().cpu().numpy() - # if is_wandb: - # logger.experiment.log({ - # "val/pred_audio_no_cfg_noLT": wandb.Audio(pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, caption="Inference No CFG, No LT"), - # "val/pred_audio_cfg_withLT": wandb.Audio(pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, caption="Inference CFG, With LT"), - # }) - # if is_tb: - # logger.experiment.add_audio( - # "val/pred_audio_no_cfg_noLT", pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, global_step=batch_idx - # ) - # logger.experiment.add_audio( - # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, global_step=batch_idx - # ) - local_transformer_loss = batch_output['local_transformer_loss'] val_output = { 'val_loss': loss, @@ -1780,15 +1560,6 @@ def infer_batch( use_cfg=use_cfg, cfg_scale=cfg_scale, ) - elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - n_steps=maskgit_n_steps, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) else: raise ValueError( f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}" diff 
--git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 19e5793a892b..34ba8d62c730 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -34,7 +34,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -134,7 +134,7 @@ class MagpieInferenceRunner: def __init__( self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel - model: Union[MagpieTTSModel, MagpieTTSDecoderModel], + model: Union[MagpieTTSModel, EasyMagpieTTSModel], config: InferenceConfig, ): """Initialize the inference runner. 
diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index cce2855dd82b..d7dd672867c3 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel from nemo.utils import logging @@ -255,7 +255,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: def load_magpie_model( config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False -) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: +) -> Tuple[Union[MagpieTTSModel, EasyMagpieTTSModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -273,7 +273,7 @@ def load_magpie_model( ValueError: If configuration is invalid or sample rates don't match. 
""" config.validate() - model_cls = MagpieTTSDecoderModel if is_decoder_only_model else MagpieTTSModel + model_cls = EasyMagpieTTSModel if is_decoder_only_model else MagpieTTSModel if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) From d88eda2ad5e7c24b838c5c23161c24ede71919f5 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 22 Jan 2026 19:33:55 +0000 Subject: [PATCH 16/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/__init__.py | 2 +- nemo/collections/tts/models/audio_codec.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index d9f406a3ba3d..20984cfccc6a 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,11 +14,11 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel -from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index d5c1afb3a5bf..de11bb4f9229 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -183,13 +183,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # load pretrained 
model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") import os + # TODO: revert this if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) else: self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False - ) + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", + strict=False, + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") From 122af0ab96dac2129aa53e69ea9196ff7fb8c773 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 27 Jan 2026 19:29:58 -0500 Subject: [PATCH 17/94] bug fix related to spectral codec Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index bab703c242ad..120b63aef46c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -845,14 +845,15 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] - if self._codec_converter is not None: - context_audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens - ).long() else: context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) + + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + 
audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() context_audio_codes, context_audio_codes_lens = self.add_special_tokens( codes=context_audio_codes, @@ -1050,10 +1051,11 @@ def process_batch(self, batch, mode="train"): else: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] - if self._codec_converter is not None: - audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=audio_codes, audio_lens=audio_codes_lens - ).long() + + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, From 59208f1913b47df67d1973c2a9b621222e81035f Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Wed, 28 Jan 2026 00:31:15 +0000 Subject: [PATCH 18/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 120b63aef46c..421d80c453fa 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -849,7 +849,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) - + if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens @@ -1051,7 +1051,7 @@ def process_batch(self, batch, mode="train"): else: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] - + if self._codec_converter is not None: audio_codes = self._codec_converter.convert_original_to_new( 
audio_tokens=audio_codes, audio_lens=audio_codes_lens From 3c8bb40067c4c98304c0929f613e339e0dc0850b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 15:32:36 -0500 Subject: [PATCH 19/94] some clean up Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 77 ++++--------------- 1 file changed, 14 insertions(+), 63 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 421d80c453fa..35d7c73d54fa 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -135,7 +135,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) num_tokens_tokenizer = len(self.tokenizer.tokens) - num_tokens = num_tokens_tokenizer + 3 # +2 for BOS and EOS + num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK self.bos_id = num_tokens - 3 self.eos_id = num_tokens - 2 self.cfg_unk_token_id = num_tokens - 1 @@ -168,48 +168,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": - from transformers.models import qwen3_moe 
- - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_20layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=20, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - # from transformers.models import qwen2_moe - # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( - # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 - # ) - # self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) - else: - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - self.lm_text_head = hf_transformer.lm_head + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) @@ -467,32 +433,18 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): + def compute_loss(self, logits, audio_codes, audio_codes_lens): """ Computes the audio codebook loss. 
Used by (1) The main Magpie-TTS transformer - (2) The local transformer, for both autoregressive and MaskGit methods + (2) The local transformer logits: (B, T', num_codebooks * num_tokens_per_codebook) audio_codes: (B, C, T') audio_codes_lens: (B,) - mask_tokens_mask: (B, C, T') True for tokens that were replaced with the MASK_TOKEN and should - therefore be the only ones included in the loss computation. """ loss_mask = get_mask_from_lengths(audio_codes_lens) - if mask_tokens_mask is not None: - # For MaskGit we only compute loss for the masked tokens. - # *Both* conditions must be true: - # 1. the token is masked - # 2. the token is not padding - loss_mask = loss_mask.unsqueeze(1) * mask_tokens_mask - if not loss_mask.any(): - # Without this we were very rarely getting NaNs in the loss - logging.warning("No tokens valid were found in compute_loss()!") - return torch.tensor(0.0, device=loss_mask.device), loss_mask - else: - # repeat loss mask for each codebook to simplify code below - loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) + loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None for codebook in range(audio_codes.size(1)): si = codebook * self.num_all_tokens_per_codebook @@ -818,7 +770,6 @@ def join_embeddings_temporally( return joined, out_lengths def prepare_context_tensors(self, batch, dropout_text_input=False): - # Transcript text = batch['text'] text_lens = batch['text_lens'] text_embedded = self.decoder.get_input_embeddings()(text) @@ -1131,8 +1082,8 @@ def process_batch(self, batch, mode="train"): ) logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) - # import ipdb; ipdb.set_trace() - codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) + + codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss local_transformer_loss = None @@ -1143,7 +1094,7 @@ 
def process_batch(self, batch, mode="train"): pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + local_transformer_logits, audio_codes_target, audio_codes_lens_target ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss From 2067ae944ee027ee57b12112ea200632b2f53cbf Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 16:01:52 -0500 Subject: [PATCH 20/94] add docstrings and data classes Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 429 ++++++++++++++---- 1 file changed, 348 insertions(+), 81 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 35d7c73d54fa..d9a6705d4a74 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -13,8 +13,9 @@ # limitations under the License. import random import time +from dataclasses import dataclass from functools import partial -from typing import List, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple import torch import wandb @@ -47,6 +48,68 @@ from nemo.utils import logging +@dataclass +class ContextTensors: + """ + Output dataclass from prepare_context_tensors containing all context-related tensors. 
+ + Attributes: + context_embedding: Combined context embedding tensor (B, T_total, E) + context_lens: Length of context for each batch item (B,) + context_audio_codes: Audio codes for context audio (B, C, T') + context_audio_embedded: Embedded context audio codes (B, T', E) + context_audio_codes_lens: Length of context audio codes (B,) + text_embedded: Embedded text tokens (B, L, E) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs (B, L) + context_text_lens: Length of context text (B,) + remaining_text_embedded: Embedded remaining text for streaming mode, None otherwise (B, T, E) + remaining_text_lens: Length of remaining text for streaming mode, None otherwise (B,) + """ + + context_embedding: torch.Tensor + context_lens: torch.Tensor + context_audio_codes: torch.Tensor + context_audio_embedded: torch.Tensor + context_audio_codes_lens: torch.Tensor + text_embedded: torch.Tensor + text_lens: torch.Tensor + context_text_tokens: torch.Tensor + context_text_lens: torch.Tensor + remaining_text_embedded: Optional[torch.Tensor] + remaining_text_lens: Optional[torch.Tensor] + + +@dataclass +class ProcessBatchOutput: + """ + Output dataclass from process_batch containing loss values and model predictions. 
+ + Attributes: + loss: Total combined loss (codebook_loss + phoneme_loss + local_transformer_loss) + codebook_loss: Loss for audio codebook prediction + phoneme_loss: Loss for phoneme prediction (None if phoneme_tokenizer is not used) + local_transformer_loss: Loss from local transformer (None if not using local transformer) + local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) + logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) + audio_codes_target: Target audio codes for the decoder, shape (B, C, T') + audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) + context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') + context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + """ + + loss: torch.Tensor + codebook_loss: torch.Tensor + phoneme_loss: Optional[torch.Tensor] + local_transformer_loss: Optional[torch.Tensor] + local_transformer_logits: Optional[torch.Tensor] + logits: torch.Tensor + audio_codes_target: torch.Tensor + audio_codes_lens_target: torch.Tensor + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + + def worker_init_fn(worker_id): # For mp.set_start_method("spawn", force=True) # The dataset class should be picklable, so we initialize non-picklable objects here @@ -769,9 +832,58 @@ def join_embeddings_temporally( return joined, out_lengths - def prepare_context_tensors(self, batch, dropout_text_input=False): - text = batch['text'] - text_lens = batch['text_lens'] + def prepare_context_tensors( + self, + text: torch.Tensor, + text_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: 
Optional[torch.Tensor] = None, + dropout_text_input: bool = False, + ) -> ContextTensors: + """ + Prepare context tensors for the EasyMagpieTTS model. + + This function processes the input text, context audio, and context text to create + the combined context embedding that will be fed to the transformer decoder. It handles + both 'full' and 'streaming' text input modes. + + Args: + text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) + context_text_tokens_lens: Length of context text for each batch item (B,) + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + If None, will be computed from context_audio. + context_audio_codes_lens: Length of context audio codes (B,). + Required if context_audio_codes is provided. + context_audio: Raw context audio waveform (B, T). + Used to compute context_audio_codes if not provided. + context_audio_lens: Length of context audio (B,). + Required if context_audio is provided. + dropout_text_input: If True, zero out the text embedding for classifier-free guidance. 
+ + Returns: + ContextTensors: A dataclass containing all prepared context tensors including: + - context_embedding: Combined context embedding (B, T_total, E) + - context_lens: Total context length per batch item (B,) + - context_audio_codes: Processed audio codes with special tokens (B, C, T') + - context_audio_embedded: Embedded context audio (B, T', E) + - context_audio_codes_lens: Length of processed context audio codes (B,) + - text_embedded: Embedded text tokens (B, L, E) + - text_lens: Text length per batch item (B,) + - context_text_tokens: Context text token IDs (B, L) + - context_text_lens: Context text length per batch item (B,) + - remaining_text_embedded: For streaming mode, embedded remaining text (B, T, E) + - remaining_text_lens: For streaming mode, remaining text length (B,) + + Raises: + ValueError: If neither context_audio_codes nor context_audio is provided. + ValueError: If text_input_mode is not 'full' or 'streaming'. + """ text_embedded = self.decoder.get_input_embeddings()(text) if self.use_bpe_char_tokenizer: text_mask = get_mask_from_lengths(text_lens) @@ -793,13 +905,10 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): text_embedded = text_embedded * 0.0 # Context Audio - if 'context_audio_codes' in batch: - context_audio_codes = batch['context_audio_codes'] - context_audio_codes_lens = batch['context_audio_codes_lens'] - else: - context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'] - ) + if context_audio_codes is None: + if context_audio is None: + raise ValueError("Either context_audio_codes or context_audio must be provided") + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( @@ -824,8 +933,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_embedded = 
self.embed_audio_tokens(context_audio_codes) # (B, T', E) # Context Text - context_text_tokens = batch['context_text_tokens'] - context_text_lens = batch['context_text_tokens_lens'] + context_text_lens = context_text_tokens_lens context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) remaining_text_embedded = None @@ -850,19 +958,19 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): else: raise ValueError(f"Invalid text input mode: {self.text_input_mode}") - return { - 'context_embedding': context_embedding, # (B, T_total, E) - 'context_lens': context_lens, # (B,) - 'context_audio_codes': context_audio_codes, # (B, C, T') - 'context_audio_embedded': context_audio_embedded, # (B, T', E) - 'context_audio_codes_lens': context_audio_codes_lens, # (B,) - 'text_embedded': text_embedded, # (B, L, E) - 'text_lens': text_lens, # (B,) - 'context_text_tokens': context_text_tokens, # (B, L) - 'context_text_lens': context_text_lens, # (B,) - 'remaining_text_embedded': remaining_text_embedded, # (B, T, E) - 'remaining_text_lens': remaining_text_lens, # (B,) - } + return ContextTensors( + context_embedding=context_embedding, + context_lens=context_lens, + context_audio_codes=context_audio_codes, + context_audio_embedded=context_audio_embedded, + context_audio_codes_lens=context_audio_codes_lens, + text_embedded=text_embedded, + text_lens=text_lens, + context_text_tokens=context_text_tokens, + context_text_lens=context_text_lens, + remaining_text_embedded=remaining_text_embedded, + remaining_text_lens=remaining_text_lens, + ) def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): """ @@ -968,19 +1076,98 @@ def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, con ) return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch(self, batch, mode="train"): + def process_batch( + self, + text: torch.Tensor, + text_lens: torch.Tensor, 
+ context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + audio: Optional[torch.Tensor] = None, + audio_lens: Optional[torch.Tensor] = None, + audio_codes: Optional[torch.Tensor] = None, + audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: Optional[torch.Tensor] = None, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + phoneme_tokens: Optional[torch.Tensor] = None, + phoneme_tokens_lens: Optional[torch.Tensor] = None, + mode: str = "train", + ) -> ProcessBatchOutput: + """ + Process a batch of inputs to compute model outputs and losses. + + This function performs the following steps: + 1. Prepares context tensors from text and audio inputs + 2. Optionally applies dropout to text/phoneme inputs for regularization + 3. Optionally applies classifier-free guidance (CFG) unconditional training + 4. Converts audio to codes if not already provided + 5. Embeds audio codes and combines with context embeddings + 6. Runs the transformer forward pass + 7. 
Computes codebook loss, phoneme loss (if applicable), and local transformer loss (if applicable) + + Args: + text: Input text token IDs, shape (B, L) + text_lens: Length of text for each batch item, shape (B,) + context_text_tokens: Context text token IDs for conditioning, shape (B, L_ctx) + context_text_tokens_lens: Length of context text for each batch item, shape (B,) + audio: Raw audio waveform (used if audio_codes not provided), shape (B, T_audio) + audio_lens: Length of audio for each batch item, shape (B,) + audio_codes: Pre-computed audio codes (optional, computed from audio if not provided), shape (B, C, T) + audio_codes_lens: Length of audio codes for each batch item, shape (B,) + context_audio: Raw context audio waveform (optional), shape (B, T_ctx_audio) + context_audio_lens: Length of context audio for each batch item, shape (B,) + context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) + context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, P, L_phoneme) + phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) + mode: Training mode, either "train" or "val". Affects dropout behavior. 
+ + Returns: + ProcessBatchOutput: Dataclass containing: + - loss: Total combined loss + - codebook_loss: Loss for audio codebook prediction + - phoneme_loss: Loss for phoneme prediction (None if not using phonemes) + - local_transformer_loss: Loss from local transformer (None if not used) + - local_transformer_logits: Logits from local transformer + - logits: Predicted logits from the main decoder + - audio_codes_target: Target audio codes + - audio_codes_lens_target: Length of target audio codes + - context_audio_codes: Audio codes from context + - context_audio_codes_lens: Length of context audio codes + """ + # Determine whether to apply text/phoneme dropout for regularization during training + # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False + # Phoneme dropout: randomly drop phoneme input, but only if text is not already dropped + # This ensures we don't drop both simultaneously dropout_phoneme_input = ( ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode == 'train' else False ) - context_tensors = self.prepare_context_tensors(batch, dropout_text_input) - # print("text lens", context_tensors['text_lens']) - remaining_text_embedded = context_tensors['remaining_text_embedded'] - context_embedding = context_tensors['context_embedding'] - context_lens = context_tensors['context_lens'] + # Prepare context tensors by combining text and audio context information + context_tensors = self.prepare_context_tensors( + text=text, + text_lens=text_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_audio=context_audio, + context_audio_lens=context_audio_lens, + dropout_text_input=dropout_text_input, + ) + + # Extract context tensors for use in 
the forward pass + remaining_text_embedded = context_tensors.remaining_text_embedded + context_embedding = context_tensors.context_embedding + context_lens = context_tensors.context_lens + + # Classifier-Free Guidance (CFG) unconditional training: + # With some probability, replace the context with a special unconditional token + # This allows the model to generate without conditioning during inference dropout_conditional_input = False if mode == 'train' and self.cfg_unconditional_prob > 0.0: if torch.rand(1).item() < self.cfg_unconditional_prob: @@ -997,17 +1184,17 @@ def process_batch(self, batch, mode="train"): if self.text_input_mode == 'streaming': remaining_text_embedded = torch.zeros_like(remaining_text_embedded) - if 'audio_codes' not in batch: - audio_codes, audio_codes_lens = self.audio_to_codes(batch['audio'], batch['audio_lens']) - else: - audio_codes = batch['audio_codes'] - audio_codes_lens = batch['audio_codes_lens'] + # Convert raw audio to discrete codes if codes are not already provided + if audio_codes is None: + audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + # Apply codec conversion if a converter is configured (e.g., for different codec formats) if self._codec_converter is not None: audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() + # Add BOS (beginning of sequence) and EOS (end of sequence) tokens to audio codes audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, @@ -1015,6 +1202,8 @@ def process_batch(self, batch, mode="train"): eos_id=self.audio_eos_id, ) + # Stack audio codes across codebooks for multi-codebook processing + # This reshapes codes for parallel prediction of multiple codebooks audio_codes, audio_codes_lens = self.stack_codes( audio_codes, audio_codes_lens, @@ -1023,14 +1212,23 @@ def process_batch(self, batch, mode="train"): self.frame_stacking_factor, 
self.num_audio_codebooks, ) + + # Prepare input and target sequences for autoregressive training + # Input: all tokens except the last (teacher forcing) + # Target: all tokens except the first (shifted by one position) audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder + + # Embed audio tokens to get continuous representations audio_codes_input_embedded = self.embed_audio_tokens( audio_codes_input - ) # (B, T, E) # Computing this to be use in the alignment encoder + ) # (B, T, E) + + # In streaming mode, add remaining text embeddings to audio embeddings + # This provides text information at each audio timestep if remaining_text_embedded is not None: - # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right + # Pad remaining text to match audio sequence length by adding zeros on the right padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) padding_tensor = torch.zeros( remaining_text_embedded.size(0), @@ -1039,23 +1237,32 @@ def process_batch(self, batch, mode="train"): device=remaining_text_embedded.device, ) remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded + # Concatenate context embeddings with audio embeddings along the time dimension + # Result: [context_embedding | audio_codes_input_embedded] context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens_input], ) + # Process phoneme input if phoneme tokenizer is configured if self.phoneme_tokenizer is not None: + # Compute context length offset 
for phoneme alignment + # This accounts for different delays in speech vs phoneme streams context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = ( + + # Prepare phoneme channel input with proper alignment + phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens_processed, phoneme_tokens_lens_processed = ( self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes ) ) - # print("phoneme_tokens_lens", phoneme_tokens_lens) - # print("audio_codes_lens", audio_codes_lens_input) + + # Align phoneme channel input to match the combined context+audio sequence length if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: + # Pad phoneme channel with zeros if shorter than context+audio padding_tensor = torch.zeros( phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], @@ -1064,88 +1271,120 @@ def process_batch(self, batch, mode="train"): ) phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) else: + # Truncate phoneme channel if longer than context+audio phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] + # Add phoneme information unless doing unconditional or phoneme dropout training if (not dropout_conditional_input) and (not dropout_phoneme_input): context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + # Run the transformer forward pass transformer_out = self.forward( inputs_embeds=context_plus_audio_embedded, attention_mask=get_mask_from_lengths(context_plus_audio_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) + # Extract prediction embeddings by slicing out the audio portion (excluding context) 
pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens, target_lens=audio_codes_lens_target, ) + # Project embeddings to logits for each codebook logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) - + + # Compute the main codebook prediction loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss + # Compute local transformer loss if using local transformer architecture local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + # Compute logits using the local (autoregressive) transformer local_transformer_logits = self.compute_local_transformer_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) + # Scale and add local transformer loss to total loss local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss + # Compute phoneme prediction loss if using phoneme tokenizer phoneme_loss = None if self.phoneme_tokenizer is not None: + # Extract phoneme prediction embeddings with proper alignment pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens - 1, + target_lens=phoneme_tokens_lens_processed - 1, ) + # Project to phoneme logits phoneme_logits = self.phoneme_final_proj( pred_embeddings_phoneme ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + + # Only compute phoneme loss if not doing any dropout + # (unconditional, text dropout, or phoneme dropout) if not (dropout_conditional_input or dropout_text_input or 
dropout_phoneme_input): - # Only compute phoneme loss if not doing unconditional training or text dropout phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens[:, :, 1:].long(), phoneme_tokens_lens - 1 + phoneme_logits, phoneme_tokens_processed[:, :, 1:].long(), phoneme_tokens_lens_processed - 1 ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: + # Skip phoneme loss computation during dropout training phoneme_loss = torch.tensor(0.0, device=logits.device) print("Dropout - phoneme loss skipped", phoneme_loss.item()) loss = loss + phoneme_loss - return { - 'loss': loss, - 'codebook_loss': codebook_loss, - 'phoneme_loss': phoneme_loss, - 'local_transformer_loss': local_transformer_loss, - 'local_transformer_logits': local_transformer_logits, # (B, T', num_codebooks * num_tokens_per_codebook) - 'logits': logits, - 'audio_codes_target': audio_codes_target, # (B, C, T') - 'audio_codes_lens_target': audio_codes_lens_target, # (B,) - 'context_audio_codes': context_tensors['context_audio_codes'], # (B, C, T') - 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) - } + return ProcessBatchOutput( + loss=loss, + codebook_loss=codebook_loss, + phoneme_loss=phoneme_loss, + local_transformer_loss=local_transformer_loss, + local_transformer_logits=local_transformer_logits, + logits=logits, + audio_codes_target=audio_codes_target, + audio_codes_lens_target=audio_codes_lens_target, + context_audio_codes=context_tensors.context_audio_codes, + context_audio_codes_lens=context_tensors.context_audio_codes_lens, + ) def training_step(self, batch, batch_idx): - batch_output = self.process_batch(batch) - loss = batch_output['loss'] - codebook_loss = batch_output['codebook_loss'] + # Extract inputs from batch and pass explicitly to process_batch + batch_output = self.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + 
context_text_tokens_lens=batch['context_text_tokens_lens'], + audio=batch.get('audio'), + audio_lens=batch.get('audio_lens'), + audio_codes=batch.get('audio_codes'), + audio_codes_lens=batch.get('audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + phoneme_tokens=batch.get('phoneme_tokens'), + phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), + mode="train", + ) + loss = batch_output.loss + codebook_loss = batch_output.codebook_loss self.log('train/codebook_loss', codebook_loss, prog_bar=True, sync_dist=True) self.log('train/loss', loss, prog_bar=True, sync_dist=True) if self.phoneme_tokenizer is not None: - phoneme_loss = batch_output['phoneme_loss'] + phoneme_loss = batch_output.phoneme_loss self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) - local_transformer_loss = batch_output['local_transformer_loss'] + local_transformer_loss = batch_output.local_transformer_loss if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) @@ -1188,16 +1427,34 @@ def training_step(self, batch, batch_idx): return loss def validation_step(self, batch, batch_idx): - batch_output = self.process_batch(batch, mode="val") - # self.process_batch returns a dict. We currently only log "logits" which come from the parallel prediction - # head. 
If we use local_transformer, then the local_transformer returns "local_transformer_logits" - loss = batch_output['loss'] - codebook_loss = batch_output['codebook_loss'] - logits = batch_output['logits'] - audio_codes_target = batch_output['audio_codes_target'] - audio_codes_lens_target = batch_output['audio_codes_lens_target'] - context_audio_codes = batch_output['context_audio_codes'] - context_audio_codes_lens = batch_output['context_audio_codes_lens'] + # Extract inputs from batch and pass explicitly to process_batch + batch_output = self.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + audio=batch.get('audio'), + audio_lens=batch.get('audio_lens'), + audio_codes=batch.get('audio_codes'), + audio_codes_lens=batch.get('audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + phoneme_tokens=batch.get('phoneme_tokens'), + phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), + mode="val", + ) + # Access ProcessBatchOutput dataclass attributes + # logits come from the parallel prediction head + # If using local_transformer, local_transformer_logits are also available + loss = batch_output.loss + codebook_loss = batch_output.codebook_loss + logits = batch_output.logits + audio_codes_target = batch_output.audio_codes_target + audio_codes_lens_target = batch_output.audio_codes_lens_target + context_audio_codes = batch_output.context_audio_codes + context_audio_codes_lens = batch_output.context_audio_codes_lens if batch_idx == 0 and self.global_rank == 0: # Prepare dictionary for aggregated wandb logging @@ -1215,7 +1472,7 @@ def validation_step(self, batch, batch_idx): if isinstance(logger, WandbLogger) and wandb_log_dict: 
logger.experiment.log(wandb_log_dict) - local_transformer_loss = batch_output['local_transformer_loss'] + local_transformer_loss = batch_output.local_transformer_loss val_output = { 'val_loss': loss, 'val_codebook_loss': codebook_loss, @@ -1223,7 +1480,7 @@ def validation_step(self, batch, batch_idx): } if self.phoneme_tokenizer is not None: - phoneme_loss = batch_output['phoneme_loss'] + phoneme_loss = batch_output.phoneme_loss val_output['val_phoneme_loss'] = phoneme_loss self.validation_step_outputs.append(val_output) @@ -1376,11 +1633,21 @@ def infer_batch( # TODO: Make this API same as MagpieTTS model. with torch.inference_mode(): start_time = time.time() - context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) - context_embedding = context_tensors['context_embedding'] # (B, T_total, E) - context_lens = context_tensors['context_lens'] # (B,) - remaining_text_embedded = context_tensors['remaining_text_embedded'] - remaining_text_lens = context_tensors['remaining_text_lens'] + context_tensors = self.prepare_context_tensors( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + dropout_text_input=dropout_text_input, + ) + context_embedding = context_tensors.context_embedding # (B, T_total, E) + context_lens = context_tensors.context_lens # (B,) + remaining_text_embedded = context_tensors.remaining_text_embedded + remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay From ef6a0e0e86e6c345f2e8574cda72caa914b78d0f Mon Sep 17 00:00:00 2001 From: Paarth 
Neekhara Date: Wed, 28 Jan 2026 19:33:27 -0500 Subject: [PATCH 21/94] more doc strings Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 68 ++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d9a6705d4a74..7d6e8bccadd2 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1000,6 +1000,28 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): return sliced def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + """ + Stack multiple time steps into the channel dimension to reduce sequence length. + + This function reshapes audio/phoneme codes by grouping consecutive time steps together + and placing them in the channel dimension. This allows the model to process multiple + frames in parallel while reducing the sequence length. + + Args: + codes: Input codes tensor of shape (B, C, T) where B is batch size, + C is number of codebooks, and T is sequence length. + codes_lens: Length of valid codes for each batch item, shape (B,). + bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. + eos_id: End-of-sequence token ID used for padding. + stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. + num_codebooks: Number of codebooks in the input. + + Returns: + Tuple of: + - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). + If input contains BOS tokens, they are preserved at the beginning. + - new_lens: Updated sequence lengths after stacking, shape (B,). 
+ """ if stacking_factor == 1: return codes, codes_lens @@ -1032,6 +1054,26 @@ def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_co return codes, new_lens def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + """ + Reverse the stacking operation to recover the original time dimension. + + This is the inverse of `stack_codes`. It takes codes that have been stacked + in the channel dimension and expands them back into the time dimension. + + Args: + stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) + where T_stacked = T_original // stacking_factor. + stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). + stacking_factor: The stacking factor used in the original `stack_codes` call. + If 1, no unstacking is performed. + + Returns: + Tuple of: + - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). + - orig_lens: Recovered sequence lengths, shape (B,). Note that these are the + maximum possible lengths; actual valid lengths may be shorter due to + padding applied during stacking. + """ if stacking_factor == 1: return stacked_codes, stacked_lens @@ -1051,7 +1093,29 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): return x, orig_lens def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): - # import ipdb; ipdb.set_trace() + """ + Prepare phoneme tokens as an auxiliary input channel for the decoder. + + This function processes phoneme tokens by stacking them (if configured), embedding them, + and prepending a zero-padded context region. The resulting tensor can be used as an + additional input channel to provide phoneme conditioning to the audio decoder. + + Args: + phoneme_tokens: Phoneme token IDs, shape (B, L) where B is batch size and + L is the phoneme sequence length. + phoneme_tokens_lens: Length of valid phoneme tokens for each batch item, shape (B,). 
+ context_lens: Length of the context region for each batch item, shape (B,). + Used to prepend zero-padding to align with audio context. + + Returns: + Tuple of: + - phoneme_channel_input: Embedded phoneme tokens with zero-padded context, + shape (B, T_context + T_phoneme, E) where E is the embedding dimension. + - phoneme_channel_input_lens: Total length of phoneme channel input for each + batch item (context_lens + phoneme_tokens_lens after stacking), shape (B,). + - phoneme_tokens: Stacked phoneme tokens, shape (B, phoneme_stacking_factor, T_stacked). + - phoneme_tokens_lens: Length of stacked phoneme tokens, shape (B,). + """ phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) phoneme_tokens, phoneme_tokens_lens = self.stack_codes( phoneme_tokens, @@ -1119,7 +1183,7 @@ def process_batch( context_audio_lens: Length of context audio for each batch item, shape (B,) context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, P, L_phoneme) + phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) mode: Training mode, either "train" or "val". Affects dropout behavior. 
From 0101a1aaedc204521465b3b5763158207afc00e1 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 01:07:43 +0000 Subject: [PATCH 22/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 7d6e8bccadd2..62f3aa99e46d 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1285,9 +1285,7 @@ def process_batch( audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder # Embed audio tokens to get continuous representations - audio_codes_input_embedded = self.embed_audio_tokens( - audio_codes_input - ) # (B, T, E) + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # In streaming mode, add remaining text embeddings to audio embeddings # This provides text information at each audio timestep @@ -1318,11 +1316,12 @@ def process_batch( context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay # Prepare phoneme channel input with proper alignment - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens_processed, phoneme_tokens_lens_processed = ( - self.prepare_phoneme_channel_input( - phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes - ) - ) + ( + phoneme_channel_input, + phoneme_channel_input_lens, + phoneme_tokens_processed, + phoneme_tokens_lens_processed, + ) = self.prepare_phoneme_channel_input(phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes) # Align phoneme channel input to match the combined context+audio sequence length if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: From ce19ed6c1c80791a772f3d911a9cccd749f1ff7c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 20:43:56 
-0500 Subject: [PATCH 23/94] support multiple training modes Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 22 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 21 +- nemo/collections/tts/models/easy_magpietts.py | 215 +++++++++++++++--- 3 files changed, 225 insertions(+), 33 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 8518fa79060b..76f39121322e 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -36,11 +36,27 @@ model: cfg_unconditional_prob: 0.1 # To get special_tokens of the tokenzer, you can do: # model.tokenizer.first_tokenizer.additional_special_tokens - text_input_mode: "streaming" + + # Multi-mode training configuration + # The model will randomly select one of the modes for each batch during training. + # Each mode has its own task embedding that is prepended to the context. + # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
+ training_modes: + - name: "full" + text_input_mode: "full" + streaming_phonemes_delay: 0 # Not used in full mode + streaming_speech_delay: 0 # Not used in full mode + - name: "streaming_4_8" + text_input_mode: "streaming" + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + - name: "streaming_2_4" + text_input_mode: "streaming" + streaming_phonemes_delay: 2 + streaming_speech_delay: 4 + frame_stacking_factor: 1 phoneme_stacking_factor: 2 - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 dropout_text_input_prob: 0.3 phoneme_tokenizer: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 6ed9b529eac6..1683a27f4238 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -31,11 +31,26 @@ model: cfg_unconditional_prob: 0.1 - text_input_mode: "streaming" + # Multi-mode training configuration + # The model will randomly select one of the modes for each batch during training. + # Each mode has its own task embedding that is prepended to the context. + # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
+ training_modes: + - name: "full" + text_input_mode: "full" + streaming_phonemes_delay: 0 # Not used in full mode + streaming_speech_delay: 0 # Not used in full mode + - name: "streaming_4_8" + text_input_mode: "streaming" + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + - name: "streaming_2_4" + text_input_mode: "streaming" + streaming_phonemes_delay: 2 + streaming_speech_delay: 4 + frame_stacking_factor: 1 phoneme_stacking_factor: 2 - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 dropout_text_input_prob: 0.3 phoneme_tokenizer: diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 62f3aa99e46d..07057a618d85 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -48,6 +48,26 @@ from nemo.utils import logging +@dataclass +class TrainingMode: + """ + Configuration for a training mode in multi-mode training. + + Attributes: + name: Unique identifier for this mode (e.g., "full", "streaming_4_8") + text_input_mode: Either "full" or "streaming" + streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) + streaming_speech_delay: Delay for speech stream (only used in streaming mode) + mode_idx: Index of this mode in the list of modes (used for task embedding lookup) + """ + + name: str + text_input_mode: str + streaming_phonemes_delay: int + streaming_speech_delay: int + mode_idx: int + + @dataclass class ContextTensors: """ @@ -96,6 +116,7 @@ class ProcessBatchOutput: audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + selected_training_mode: Name of the selected training mode (None if multi_mode_training is disabled) """ loss: torch.Tensor @@ -108,6 +129,7 @@ class ProcessBatchOutput: audio_codes_lens_target: 
torch.Tensor context_audio_codes: torch.Tensor context_audio_codes_lens: torch.Tensor + selected_training_mode: Optional[str] = None def worker_init_fn(worker_id): @@ -187,9 +209,36 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) - self.text_input_mode = cfg.get('text_input_mode', 'full') - self.streaming_speech_delay = cfg.get('streaming_speech_delay', 3) - self.streaming_phonemes_delay = cfg.get('streaming_phonemes_delay', 2) + + # Multi-mode training configuration + # The model trains with multiple text input modes (full, streaming with various delays) + # Each mode has its own task embedding that is prepended to the context + training_modes_cfg = cfg.get('training_modes', None) + if training_modes_cfg is None: + raise ValueError("training_modes must be specified in the config") + + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + name=mode_cfg.name, + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) + + logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") + for mode in self.training_modes: + logging.info(f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}") + + # Create a mapping from mode name to mode object for easy lookup during inference + self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} + # Default mode for inference if not specified (first mode in the list) + self.default_inference_mode = self.training_modes[0].name + self.frame_stacking_factor = 
cfg.get('frame_stacking_factor', 1) self.tokenizer = setup_tokenizers( @@ -243,6 +292,17 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) + # Task embedding for multi-mode training + # Each mode has a unique task embedding that is prepended to the context + # Only create task embedding if there are multiple modes + num_modes = len(self.training_modes) + if num_modes > 1: + self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) + logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") + else: + self.task_embedding = None + logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -843,6 +903,7 @@ def prepare_context_tensors( context_audio: Optional[torch.Tensor] = None, context_audio_lens: Optional[torch.Tensor] = None, dropout_text_input: bool = False, + training_mode: Optional[TrainingMode] = None, ) -> ContextTensors: """ Prepare context tensors for the EasyMagpieTTS model. @@ -865,6 +926,8 @@ def prepare_context_tensors( context_audio_lens: Length of context audio (B,). Required if context_audio is provided. dropout_text_input: If True, zero out the text embedding for classifier-free guidance. + training_mode: Optional TrainingMode object specifying the mode to use. + If None, uses the first mode from training_modes as default. Returns: ContextTensors: A dataclass containing all prepared context tensors including: @@ -884,17 +947,27 @@ def prepare_context_tensors( ValueError: If neither context_audio_codes nor context_audio is provided. ValueError: If text_input_mode is not 'full' or 'streaming'. 
""" + # Determine the mode parameters to use + # If no mode is specified, use the first (default) mode + if training_mode is None: + training_mode = self.training_modes[0] + + current_text_input_mode = training_mode.text_input_mode + current_streaming_speech_delay = training_mode.streaming_speech_delay + current_streaming_phonemes_delay = training_mode.streaming_phonemes_delay + current_mode_idx = training_mode.mode_idx + text_embedded = self.decoder.get_input_embeddings()(text) if self.use_bpe_char_tokenizer: text_mask = get_mask_from_lengths(text_lens) cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) text_embedded = text_embedded + cas_embedding - if text_embedded.shape[1] < self.streaming_speech_delay + 1: + if text_embedded.shape[1] < current_streaming_speech_delay + 1: # If text is too short, pad it with zeros padding_tensor = torch.zeros( text_embedded.shape[0], - self.streaming_speech_delay + 1 - text_embedded.shape[1], + current_streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device, ) @@ -936,27 +1009,51 @@ def prepare_context_tensors( context_text_lens = context_text_tokens_lens context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + # Prepare task embedding for multi-mode training + # Only use task embedding if there are multiple modes (task_embedding is not None) + task_embedding = None + task_embedding_lens = None + if self.task_embedding is not None and current_mode_idx is not None: + batch_size = text.size(0) + mode_idx_tensor = torch.full( + (batch_size,), current_mode_idx, dtype=torch.long, device=text.device + ) + task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) + remaining_text_embedded = None remaining_text_lens = None - if self.text_input_mode == 'full': - context_embedding, context_lens = 
self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, text_lens], - ) - elif self.text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:, : self.streaming_speech_delay, :] - prompt_text_lens = torch.ones_like(text_lens) * self.streaming_speech_delay - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - remaining_text_embedded = text_embedded[:, self.streaming_speech_delay :, :] - remaining_text_lens = text_lens - self.streaming_speech_delay + if current_text_input_mode == 'full': + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded, text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, text_lens], + ) + elif current_text_input_mode == 'streaming': + prompt_text_embedded = text_embedded[:, :current_streaming_speech_delay, :] + prompt_text_lens = torch.ones_like(text_lens) * current_streaming_speech_delay + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded, prompt_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], + 
lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + remaining_text_embedded = text_embedded[:, current_streaming_speech_delay:, :] + remaining_text_lens = text_lens - current_streaming_speech_delay remaining_text_lens = remaining_text_lens.clamp(min=0) remaining_text_mask = get_mask_from_lengths(remaining_text_lens) remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) else: - raise ValueError(f"Invalid text input mode: {self.text_input_mode}") + raise ValueError(f"Invalid text input mode: {current_text_input_mode}") return ContextTensors( context_embedding=context_embedding, @@ -1157,6 +1254,7 @@ def process_batch( phoneme_tokens: Optional[torch.Tensor] = None, phoneme_tokens_lens: Optional[torch.Tensor] = None, mode: str = "train", + training_mode: Optional[TrainingMode] = None, ) -> ProcessBatchOutput: """ Process a batch of inputs to compute model outputs and losses. @@ -1186,6 +1284,8 @@ def process_batch( phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) mode: Training mode, either "train" or "val". Affects dropout behavior. + training_mode: Optional TrainingMode object specifying which mode to use. + If None and multi_mode_training is enabled, a random mode is selected during training. 
Returns: ProcessBatchOutput: Dataclass containing: @@ -1200,6 +1300,23 @@ def process_batch( - context_audio_codes: Audio codes from context - context_audio_codes_lens: Length of context audio codes """ + # Select training mode for multi-mode training + # During training, randomly select a mode if not specified + # During validation, use the first mode (default) if not specified + selected_training_mode = training_mode + if selected_training_mode is None: + if mode == 'train': + # Randomly select a mode during training + selected_training_mode = random.choice(self.training_modes) + else: + # Use the first mode during validation + selected_training_mode = self.training_modes[0] + + # Get the current mode's parameters + current_text_input_mode = selected_training_mode.text_input_mode + current_streaming_speech_delay = selected_training_mode.streaming_speech_delay + current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + # Determine whether to apply text/phoneme dropout for regularization during training # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False @@ -1222,6 +1339,7 @@ def process_batch( context_audio=context_audio, context_audio_lens=context_audio_lens, dropout_text_input=dropout_text_input, + training_mode=selected_training_mode, ) # Extract context tensors for use in the forward pass @@ -1245,7 +1363,7 @@ def process_batch( # inference easier especially with KV caching and using a duplicated batch. context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. 
- if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': remaining_text_embedded = torch.zeros_like(remaining_text_embedded) # Convert raw audio to discrete codes if codes are not already provided @@ -1313,7 +1431,8 @@ def process_batch( if self.phoneme_tokenizer is not None: # Compute context length offset for phoneme alignment # This accounts for different delays in speech vs phoneme streams - context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + # Use the selected mode's streaming delays + context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay # Prepare phoneme channel input with proper alignment ( @@ -1417,6 +1536,7 @@ def process_batch( audio_codes_lens_target=audio_codes_lens_target, context_audio_codes=context_tensors.context_audio_codes, context_audio_codes_lens=context_tensors.context_audio_codes_lens, + selected_training_mode=selected_training_mode.name if selected_training_mode is not None else None, ) def training_step(self, batch, batch_idx): @@ -1451,6 +1571,13 @@ def training_step(self, batch, batch_idx): if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) + # Log training mode info for multi-mode training + if batch_output.selected_training_mode is not None: + # Log which mode was selected for this batch + # Convert mode name to an index for logging + mode_idx = self.mode_name_to_mode[batch_output.selected_training_mode].mode_idx + self.log('train/training_mode_idx', float(mode_idx), on_step=True) + # Log batch info batch_size, text_token_max_len = batch["text"].shape text_token_total_num = batch["text_lens"].sum() @@ -1692,10 +1819,43 @@ def infer_batch( phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False, + inference_mode: Optional[str] = None, ): - # TODO: Make this API same as MagpieTTS model. 
+ """ + Run inference on a batch of inputs. + + Args: + batch: Input batch containing text, context, etc. + max_decoder_steps: Maximum number of decoding steps. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + use_local_transformer_for_inference: Whether to use local transformer. + maskgit_n_steps: Number of MaskGit steps. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. + phoneme_sampling_method: 'argmax' or 'sample'. + dropout_text_input: Whether to dropout text input. + inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). + If None, uses the default inference mode (first mode in training_modes). + """ with torch.inference_mode(): start_time = time.time() + + # Resolve inference mode + mode_name = inference_mode if inference_mode is not None else self.default_inference_mode + if mode_name in self.mode_name_to_mode: + selected_training_mode = self.mode_name_to_mode[mode_name] + logging.info(f"Using inference mode: {selected_training_mode.name}") + else: + available_modes = list(self.mode_name_to_mode.keys()) + raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") + + # Get current mode parameters + current_text_input_mode = selected_training_mode.text_input_mode + current_streaming_speech_delay = selected_training_mode.streaming_speech_delay + current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + context_tensors = self.prepare_context_tensors( text=batch['text'], text_lens=batch['text_lens'], @@ -1706,6 +1866,7 @@ def infer_batch( context_audio=batch.get('context_audio'), context_audio_lens=batch.get('context_audio_lens'), dropout_text_input=dropout_text_input, + training_mode=selected_training_mode, ) context_embedding = context_tensors.context_embedding # (B, T_total, E) context_lens = context_tensors.context_lens # (B,) @@ -1713,7 +1874,7 @@ def infer_batch( remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( self.prepare_phoneme_channel_input( batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes @@ -1736,7 +1897,7 @@ def infer_batch( audio_codes_input = audio_codes_bos audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) - if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 remaining_text_pad_tensor = torch.zeros( remaining_text_embedded.size(0), @@ -1756,7 +1917,7 @@ def infer_batch( min_context_len = context_plus_audio_lens.min().item() if self.phoneme_tokenizer is not None: min_context_len = ( - min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 + min_context_len - current_streaming_speech_delay 
+ current_streaming_phonemes_delay - 1 ) # 1 for audio BOS that we had added. actual_batch_size = context_embedding.size(0) @@ -1947,7 +2108,7 @@ def infer_batch( new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) new_emb_unconditional = new_emb * 1 - if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': _bs = context_embedding.size(0) remaining_text_embedded_current = remaining_text_embedded[ torch.arange(_bs), current_text_positions.clamp(min=0), : From 704a5c843a05fad96b8701681ca8296b516032d7 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 01:46:11 +0000 Subject: [PATCH 24/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 07057a618d85..628808c50d92 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -230,9 +230,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") for mode in self.training_modes: - logging.info(f" - {mode.name}: text_input_mode={mode.text_input_mode}, " - f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " - f"streaming_speech_delay={mode.streaming_speech_delay}") + logging.info( + f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}" + ) # Create a mapping from mode name to mode object for easy lookup during inference self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} @@ -1015,9 +1017,7 @@ def prepare_context_tensors( task_embedding_lens = None if self.task_embedding is not None and current_mode_idx is not 
None: batch_size = text.size(0) - mode_idx_tensor = torch.full( - (batch_size,), current_mode_idx, dtype=torch.long, device=text.device - ) + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=text.device) task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) @@ -1432,7 +1432,9 @@ def process_batch( # Compute context length offset for phoneme alignment # This accounts for different delays in speech vs phoneme streams # Use the selected mode's streaming delays - context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + context_lens_for_phonemes = ( + context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + ) # Prepare phoneme channel input with proper alignment ( @@ -1874,7 +1876,9 @@ def infer_batch( remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + context_lens_for_phonemes = ( + context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + ) phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( self.prepare_phoneme_channel_input( batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes From 038d224311ab8d8f2972dfdeb92288ecdc4a84f0 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 29 Jan 2026 12:02:58 -0500 Subject: [PATCH 25/94] default mode for backward compatibility Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 628808c50d92..e2f080903700 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -215,18 +215,28 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Each mode has its own task embedding that is prepended to the context training_modes_cfg = cfg.get('training_modes', None) if training_modes_cfg is None: - raise ValueError("training_modes must be specified in the config") - - self.training_modes = [] - for mode_idx, mode_cfg in enumerate(training_modes_cfg): - mode = TrainingMode( - name=mode_cfg.name, - text_input_mode=mode_cfg.text_input_mode, - streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), - streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), - mode_idx=mode_idx, - ) - self.training_modes.append(mode) + # Create a default training mode for backward compatibility + self.training_modes = [ + TrainingMode( + name="streaming_4_8", + text_input_mode="streaming", + streaming_phonemes_delay=4, + streaming_speech_delay=8, + mode_idx=0, + ) + ] + + else: + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + name=mode_cfg.name, + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") for mode in self.training_modes: From 3f582023c964876c1afb85a15ca10cdf0ba68a0a Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 17:03:54 +0000 Subject: [PATCH 26/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e2f080903700..06313cd34ec4 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -225,7 +225,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): mode_idx=0, ) ] - + else: self.training_modes = [] for mode_idx, mode_cfg in enumerate(training_modes_cfg): From d58a560adcf4efeb98f062b4832ad0225d2a0d7c Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Thu, 29 Jan 2026 16:12:07 -0800 Subject: [PATCH 27/94] default config changes Signed-off-by: Shehzeen Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 28 ++++--------------- .../conf/magpietts/easy_magpietts_lhotse.yaml | 28 ++++--------------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 76f39121322e..15ccfbba9f2a 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -42,18 +42,10 @@ model: # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "full" - text_input_mode: "full" - streaming_phonemes_delay: 0 # Not used in full mode - streaming_speech_delay: 0 # Not used in full mode - name: "streaming_4_8" - text_input_mode: "streaming" + text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 4 streaming_speech_delay: 8 - - name: "streaming_2_4" - text_input_mode: "streaming" - streaming_phonemes_delay: 2 - streaming_speech_delay: 4 frame_stacking_factor: 1 phoneme_stacking_factor: 2 @@ -73,20 +65,10 @@ model: use_chars: true use_stresses: true - text_tokenizers: # Add more languages for multi-lingual TTS - english_phoneme: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + text_tokenizers: + qwen2.5b: + _target_: AutoTokenizer + pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" train_ds: dataset: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 1683a27f4238..cd4b314ee970 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -36,18 +36,10 @@ model: # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "full" - text_input_mode: "full" - streaming_phonemes_delay: 0 # Not used in full mode - streaming_speech_delay: 0 # Not used in full mode - name: "streaming_4_8" - text_input_mode: "streaming" + text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 4 streaming_speech_delay: 8 - - name: "streaming_2_4" - text_input_mode: "streaming" - streaming_phonemes_delay: 2 - streaming_speech_delay: 4 frame_stacking_factor: 1 phoneme_stacking_factor: 2 @@ -67,20 +59,10 @@ model: use_chars: true use_stresses: true - text_tokenizers: # Add more languages for multi-lingual TTS - english_phoneme: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + text_tokenizers: + qwen2.5b: + _target_: AutoTokenizer + pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" train_ds: use_lhotse: ${model.use_lhotse} From a7fa4781e6ac8c9ae406a0b54732708bfde1da5f Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 1 Feb 2026 15:09:18 -0800 Subject: [PATCH 28/94] Magpietts decoderonly 2601 bpe ipa tokenizer (#57) * multilingual BPE IPA tokenizer Signed-off-by: Shehzeen Hussain * BPE IPA tokenizer, configurable audio embedding size and data processing scripts Signed-off-by: Shehzeen Hussain * remove unnecessary scripts Signed-off-by: Shehzeen Hussain * clean up scripts Signed-off-by: Shehzeen Hussain * simplify dropout logic Signed-off-by: Shehzeen Hussain * handle corner cases Signed-off-by: Shehzeen Hussain * trainer strategy ddp Signed-off-by: Shehzeen Hussain * trainer strategy undo ddp Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen 
Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 32 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 32 +- examples/tts/easy_magpietts.py | 1 - .../text_to_speech/tts_tokenizers.py | 45 +- .../tts/data/text_to_speech_dataset.py | 14 +- .../tts/data/text_to_speech_dataset_lhotse.py | 16 +- nemo/collections/tts/models/easy_magpietts.py | 94 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 359 + .../ipa_scripts/analyze_ipa_tokenization.py | 728 ++ .../ipa_scripts/cuts_dirs_config.json | 45 + .../ipa_scripts/train_ipa_bpe_tokenizer.py | 521 + ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 +++++++++++++++++ 12 files changed, 11767 insertions(+), 74 deletions(-) create mode 100644 scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py create mode 100644 scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py create mode 100644 scripts/magpietts/ipa_scripts/cuts_dirs_config.json create mode 100644 scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py create mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 15ccfbba9f2a..eea075870b07 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -21,17 +21,18 @@ model: embedding_dim: 1536 hidden_dim: 1536 + audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? 
max_epochs: ${max_epochs} steps_per_epoch: ${weighted_sampling_steps_per_epoch} # Local transformer parameters for autoregressive codebook prediction within a frame - local_transformer_type: "none" # "none", "autoregressive", "maskgit" - # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_type: "autoregressive" # "none", "autoregressive" + # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 local_transformer_n_layers: 3 - local_transformer_n_heads: 1 - local_transformer_hidden_dim: 256 + local_transformer_n_heads: 12 + local_transformer_hidden_dim: 1536 cfg_unconditional_prob: 0.1 # To get special_tokens of the tokenzer, you can do: @@ -47,28 +48,19 @@ model: streaming_phonemes_delay: 4 streaming_speech_delay: 8 - frame_stacking_factor: 1 - phoneme_stacking_factor: 2 + frame_stacking_factor: 2 + phoneme_stacking_factor: 1 dropout_text_input_prob: 0.3 + dropout_phoneme_input_prob: 0.3 phoneme_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 1.0 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer + tokenizer_path: "scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json" text_tokenizers: - qwen2.5b: + nemotron_nano_30b: _target_: AutoTokenizer - pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" + pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" train_ds: dataset: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 
cd4b314ee970..2327820e44a4 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -19,15 +19,16 @@ model: embedding_dim: 1536 hidden_dim: 1536 + audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? # Local transformer parameters for autoregressive codebook prediction within a frame - local_transformer_type: "none" # "none", "autoregressive", "maskgit" - # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_type: "autoregressive" # "none", "autoregressive" + # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 local_transformer_n_layers: 3 - local_transformer_n_heads: 1 - local_transformer_hidden_dim: 256 + local_transformer_n_heads: 12 + local_transformer_hidden_dim: 1536 cfg_unconditional_prob: 0.1 @@ -41,28 +42,19 @@ model: streaming_phonemes_delay: 4 streaming_speech_delay: 8 - frame_stacking_factor: 1 - phoneme_stacking_factor: 2 + frame_stacking_factor: 2 + phoneme_stacking_factor: 1 dropout_text_input_prob: 0.3 + dropout_phoneme_input_prob: 0.3 phoneme_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 1.0 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer + tokenizer_path: "scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json" text_tokenizers: - qwen2.5b: + nemotron_nano_30b: _target_: AutoTokenizer - pretrained_model: 
"Qwen/Qwen2.5-1.5B-Instruct" + pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" train_ds: use_lhotse: ${model.use_lhotse} diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 4195060b87ef..705c4ab77134 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -21,7 +21,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager - @hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 4ecd544df81e..81f875750d64 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1172,6 +1172,39 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): return [self._token2id[p] for p in ps] +class IPABPETokenizer: + """Simple IPA BPE tokenizer wrapper around HuggingFace tokenizers. + + Args: + tokenizer_path: Path to the tokenizer.json file (or directory containing it). 
+ """ + + def __init__(self, tokenizer_path: str): + import os + + from tokenizers import Tokenizer + + if os.path.isdir(tokenizer_path): + tokenizer_file = os.path.join(tokenizer_path, "tokenizer.json") + else: + tokenizer_file = tokenizer_path + + if not os.path.exists(tokenizer_file): + raise ValueError(f"Tokenizer file not found: {tokenizer_file}") + + self._tokenizer = Tokenizer.from_file(tokenizer_file) + self.tokens = self._tokenizer.get_vocab() + self.pad = self.tokens.get("", None) + + def encode(self, text: str) -> List[int]: + """Encode IPA text to token IDs.""" + return self._tokenizer.encode(text).ids + + def decode(self, tokens: List[int]) -> str: + """Decode token IDs back to IPA text.""" + return self._tokenizer.decode(tokens) + + # TODO @xueyang: subclassing from `nemo/collections/common/tokenizers/tokenizer_spec.py::TokenizerSpec`, and/or # adjust to reuse `nemo/collections/common/tokenizers/aggregate_tokenizer.py::AggregateTokenizer` class AggregatedTTSTokenizer: @@ -1202,7 +1235,13 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase _tokens = list(tokenizer.get_vocab().keys()) tokens.extend(_tokens) num_tokens = len(_tokens) - tokenizer_pad_ids[tokenizer_name] = tokenizer.pad_token_id + tokenizer_offset + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.unk_token_id + if pad_token_id is None: + raise ValueError( + f"Tokenizer '{tokenizer_name}' has no pad_token_id or unk_token_id. " + "Please set one before using with AggregatedTTSTokenizer." 
+ ) + tokenizer_pad_ids[tokenizer_name] = pad_token_id + tokenizer_offset else: raise ValueError("Tokenizers must be either BaseTokenizer or HuggingFace PreTrainedTokenizerBase.") tokenizer_offset += num_tokens @@ -1217,8 +1256,10 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase # Define aggregated token's pad value from the first tokenizer's pad value first_tokenizer = self.tokenizers[tokenizer_names[0]] self.first_tokenizer = first_tokenizer - if hasattr(first_tokenizer, "pad_token_id"): # Defined in PreTrainedTokenizerBase subclasses + if hasattr(first_tokenizer, "pad_token_id") and first_tokenizer.pad_token_id is not None: self.pad = first_tokenizer.pad_token_id + elif hasattr(first_tokenizer, "unk_token_id") and first_tokenizer.unk_token_id is not None: + self.pad = first_tokenizer.unk_token_id elif hasattr(first_tokenizer, "pad"): # Defined in BaseTokenizer subclasses self.pad = first_tokenizer.pad else: diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 254169f621c6..e25e703f52ee 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -24,7 +24,7 @@ import torch.utils.data from nemo.collections.asr.parts.utils.manifest_utils import read_manifest -from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer +from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer, IPABPETokenizer from nemo.collections.tts.parts.preprocessing.feature_processors import FeatureProcessor from nemo.collections.tts.parts.preprocessing.features import Featurizer from nemo.collections.tts.parts.utils.tts_dataset_utils import ( @@ -436,7 +436,17 @@ def __getitem__(self, index): } if self.phoneme_tokenizer is not None: - phoneme_tokens = self.phoneme_tokenizer.encode(data.text) + # Use IPA text for IPABPETokenizer (required), otherwise use 
regular text + if isinstance(self.phoneme_tokenizer, IPABPETokenizer): + if 'ipa' not in data.manifest_entry: + raise ValueError( + f"IPABPETokenizer requires 'ipa' field but it is not available in the manifest entry. " + f"Text: {data.text}" + ) + phoneme_text = data.manifest_entry['ipa'] + else: + phoneme_text = data.text + phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) phoneme_tokens = ( [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] ) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 9bad7a36e44a..480119202e28 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -24,7 +24,7 @@ from omegaconf import DictConfig from transformers import AutoTokenizer, T5Tokenizer -from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer +from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer from nemo.collections.tts.parts.utils.tts_dataset_utils import ( beta_binomial_prior_distribution, normalize_volume, @@ -41,7 +41,7 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): for tokenizer_name in all_tokenizers_config: tokenizer_config = all_tokenizers_config[tokenizer_name] if tokenizer_config._target_ == 'AutoTokenizer': - tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.pretrained_model) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.pretrained_model, trust_remote_code=True) elif tokenizer_config._target_ == 'T5Tokenizer': tokenizer = T5Tokenizer.from_pretrained(tokenizer_config.pretrained_model) else: @@ -411,7 +411,17 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: token_len_list.append(text_len) if self.phoneme_tokenizer is not None: - phoneme_tokens = 
self.phoneme_tokenizer.encode(text_str) + # Use IPA text for IPABPETokenizer (required), otherwise use regular text_str + if isinstance(self.phoneme_tokenizer, IPABPETokenizer): + if not cut.supervisions[0].has_custom("ipa"): + raise ValueError( + f"IPABPETokenizer requires 'ipa' field but it is not available in the cut. " + f"Cut ID: {cut.id}, Text: {text_str}" + ) + phoneme_text = cut.supervisions[0].ipa + else: + phoneme_text = text_str + phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) phoneme_tokens = ( [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] ) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 06313cd34ec4..68a48ab9701c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -280,11 +280,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters + self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) + audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) + # Projection from audio_embedding_dim to embedding_dim (Identity if same) + if self.audio_embedding_dim != cfg.embedding_dim: + self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) + else: + self.audio_in_projection = nn.Identity() + if self.phoneme_tokenizer is not None: phoneme_embeddings = [] for _ in range(self.phoneme_stacking_factor): @@ -299,6 +308,7 @@ def __init__(self, 
cfg: DictConfig, trainer: 'Trainer' = None): hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) self.decoder = hf_transformer.model + # self.decoder.to(torch.float32) self.lm_text_head = hf_transformer.lm_head self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) @@ -335,8 +345,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): special_vocab=special_vocab, ) + # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) + if self.audio_embedding_dim != cfg.hidden_dim: + self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) + else: + self.audio_out_projection = nn.Identity() + self.final_proj = nn.Linear( - cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + self.audio_embedding_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') @@ -358,11 +374,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, use_learnable_pos_emb=True, ) + # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) + if self.audio_embedding_dim != local_transformer_hidden_dim: + self.local_transformer_audio_out_projection = nn.Linear(local_transformer_hidden_dim, self.audio_embedding_dim) + else: + self.local_transformer_audio_out_projection = nn.Identity() local_transformer_out_projections = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): # Have a separate projection layer for each codebook, to distinguish between them local_transformer_out_projections.append( - nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook) + nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) ) self.local_transformer_out_projections = 
nn.ModuleList(local_transformer_out_projections) @@ -496,6 +517,8 @@ def embed_audio_tokens(self, audio_tokens): else: audio_embedding = audio_embedding + embedding audio_embedding = audio_embedding / audio_tokens.size(1) + # Project from audio_embedding_dim to embedding_dim + audio_embedding = self.audio_in_projection(audio_embedding) return audio_embedding def embed_phoneme_tokens(self, phoneme_tokens): @@ -532,12 +555,14 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', hidden_dim) local_transformer_input = [dec_out_all] for codebook_num in range(audio_codes_target.size(1)): codes = audio_codes_target[:, codebook_num] # (B, T') codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', audio_embedding_dim) + # Project from audio_embedding_dim to embedding_dim + codebook_embedding = self.audio_in_projection(codebook_embedding) local_transformer_input.append(codebook_embedding) local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) @@ -552,6 +577,8 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ else: # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + # Project from local_transformer_hidden_dim to audio_embedding_dim + local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) all_code_logits = [] for codebook_num in range(audio_codes_target.size(1)): # Using a separate projection layer for each codebook (to distinguish between them) @@ -666,8 +693,12 @@ def local_transformer_sample_autoregressive( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device ) local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num]( + # Project from local_transformer_hidden_dim to audio_embedding_dim + local_transformer_output_projected = self.local_transformer_audio_out_projection( local_transformer_output[:, -1, :] + ) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output_projected ) # (B, num_all_tokens_per_codebook) if use_cfg: actual_batch_size = codebook_logits.size(0) // 2 @@ -697,13 +728,15 @@ def local_transformer_sample_autoregressive( all_preds.append(codebook_preds) next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( 1 - ) # (B, 1, 128) + ) # (B, 1, audio_embedding_dim) + # Project from audio_embedding_dim to embedding_dim, then to local_transformer_hidden_dim + next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) next_local_transformer_input = self.local_transformer_in_projection( next_local_transformer_input - ) # (B, 1, 128) + ) # (B, 1, local_transformer_hidden_dim) local_transformer_input = torch.cat( [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, 128) + ) # (B, T+1, local_transformer_hidden_dim) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) if 
use_cfg: @@ -897,7 +930,8 @@ def join_embeddings_temporally( dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) # Assign embedding_i to the correct positions in joined - joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask] + # Ensure dtype matches to avoid errors during mixed-precision training + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) # move cursor past this segment offset += len_i @@ -1330,13 +1364,11 @@ def process_batch( # Determine whether to apply text/phoneme dropout for regularization during training # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - # Phoneme dropout: randomly drop phoneme input, but only if text is not already dropped - # This ensures we don't drop both simultaneously - dropout_phoneme_input = ( - ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) - if mode == 'train' - else False - ) + dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False + if (dropout_phoneme_input and dropout_text_input): + # Only one of the two can be True, so choose randomly + dropout_phoneme_input = random.random() < 0.5 + dropout_text_input = not dropout_phoneme_input # Prepare context tensors by combining text and audio context information context_tensors = self.prepare_context_tensors( @@ -1420,13 +1452,18 @@ def process_batch( if remaining_text_embedded is not None: # Pad remaining text to match audio sequence length by adding zeros on the right padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - padding_tensor = torch.zeros( - remaining_text_embedded.size(0), - padding_len, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], 
dim=1) + if padding_len > 0: + padding_tensor = torch.zeros( + remaining_text_embedded.size(0), + padding_len, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) + remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + else: + # NOTE(review): use logging.warning instead of print for this warning + print(f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}") + remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded @@ -1487,7 +1524,9 @@ def process_batch( ) # Project embeddings to logits for each codebook - logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) + # First project from hidden_dim to audio_embedding_dim, then to logits + pred_embeddings_audio = self.audio_out_projection(pred_embeddings) + logits = self.final_proj(pred_embeddings_audio) # (B, T', num_codebooks * num_tokens_per_codebook) # Compute the main codebook prediction loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) @@ -1553,6 +1592,7 @@ def process_batch( def training_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + # NOTE(review): leftover ipdb debugger breakpoint — remove before merging batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'],
a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py new file mode 100644 index 000000000000..61a124d56ccc --- /dev/null +++ b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +""" +Add IPA strings (from espeak/espeak-ng) to Lhotse cuts jsonl.gz shards. + +For each cuts directory like: + /Data/.../de/.../cuts +creates: + /Data/.../de/.../cuts_with_ipa +and writes corresponding cuts.000000.jsonl.gz, etc. with an added IPA field. + +IPA is added to each supervision under: + cut["supervisions"][i]["custom"]["ipa"] + +Usage: + python add_ipa_to_lhotse_shards.py --lang de + python add_ipa_to_lhotse_shards.py --lang all # run all languages + +Cuts directories per language are loaded from a JSON config file (see --config; defaults to cuts_dirs_config.json next to this script). +""" + +from __future__ import annotations + +import argparse +import concurrent.futures as cf +import gzip +import json +import os +import re +import shutil +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple + +# ------------------------- +# USER CONFIG +# ------------------------- + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + +# Map your dataset language keys to espeak voice codes (adjust as needed). +# For German, espeak-ng uses "de" typically.
+ESPEAK_VOICE_BY_LANG: Dict[str, str] = { + "de": "de", + "en": "en", + "es": "es", + "fr": "fr", + "hi": "hi", + "it": "it", + "vi": "vi", + "zh": "zh", + "ru": "ru", + "ja": "ja", + "ko": "ko", + "ar": "ar", + "he": "he", + "nl": "nl", + "pl": "pl", + "pt": "pt", +} + +OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa +SHARD_GLOB = "cuts.*.jsonl.gz" + +# Parallelism +MAX_WORKERS = max(1, (os.cpu_count() or 4) - 1) +# MAX_WORKERS = 8 + +# If True, skip writing if output shard exists (basic resume) +SKIP_EXISTING_OUTPUT_SHARDS = False +# ------------------------- +# IMPLEMENTATION +# ------------------------- + +IPA_FLAG = "--ipa" # espeak-ng uses --ipa, espeak supports --ipa in many builds +# Use --quiet if available; safe to try. +COMMON_FLAGS = ["-q"] + +# Some espeak builds output extra spaces/newlines; we normalize. +_WS_RE = re.compile(r"\s+") + + +def _find_espeak_binary() -> str: + """Prefer espeak-ng if present, else espeak.""" + for exe in ("espeak-ng", "espeak"): + if shutil.which(exe): + return exe + raise RuntimeError( + "Neither 'espeak-ng' nor 'espeak' was found on PATH. " + "Install espeak-ng (recommended) or espeak." + ) + + +@dataclass(frozen=True) +class EspeakRunner: + exe: str + voice: str + + def text_to_ipa(self, text: str) -> str: + """ + Convert text -> IPA using espeak/espeak-ng. + """ + # Note: We pass text via stdin to avoid shell escaping issues. 
+ cmd = [self.exe, "-v", self.voice, IPA_FLAG] + COMMON_FLAGS + try: + proc = subprocess.run( + cmd, + input=text.encode("utf-8"), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + except Exception as e: + raise RuntimeError(f"Failed to run {cmd}: {e}") from e + + if proc.returncode != 0: + raise RuntimeError( + f"espeak command failed (rc={proc.returncode})\n" + f"cmd: {' '.join(cmd)}\n" + f"stderr: {proc.stderr.decode('utf-8', errors='replace')}" + ) + + out = proc.stdout.decode("utf-8", errors="replace").strip() + # Normalize whitespace to single spaces + out = _WS_RE.sub(" ", out).strip() + return out + + +def iter_shards(cuts_dir: Path) -> List[Path]: + return sorted(cuts_dir.glob(SHARD_GLOB)) + + +def derive_output_dir(cuts_dir: Path) -> Path: + # If dir name ends with "cuts", produce "cuts_with_ipa". + # Otherwise append suffix to the directory name. + name = cuts_dir.name + if name == "cuts": + out_name = f"cuts{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def load_json_line(line: str) -> dict: + return json.loads(line) + + +def dump_json_line(obj: dict) -> str: + # compact, consistent output + return json.dumps(obj, ensure_ascii=False) + + +class IPACache: + """ + Process-local cache. Speeds up repeated identical texts. + """ + + def __init__(self) -> None: + self._cache: Dict[Tuple[str, str], str] = {} + + def get(self, voice: str, text: str) -> Optional[str]: + return self._cache.get((voice, text)) + + def set(self, voice: str, text: str, ipa: str) -> None: + self._cache[(voice, text)] = ipa + + +def add_ipa_to_cut( + cut: dict, + espeak: EspeakRunner, + cache: IPACache, +) -> dict: + """ + Adds IPA to each supervision custom field: custom["ipa"]. + Uses supervision["custom"]["normalized_text"] if available, otherwise supervision["text"] as source text. + For Vietnamese (vi), uses original_text and updates text/normalized_text fields. 
+ """ + sups = cut.get("supervisions") or [] + is_vietnamese = espeak.voice == "vi" + for sup in sups: + custom = sup.get("custom") + if custom is None: + custom = {} + sup["custom"] = custom + + # For Vietnamese, use original_text and fix the text fields + if is_vietnamese and custom.get("original_text"): + text = custom["original_text"] + sup["text"] = text + custom["normalized_text"] = text + else: + text = custom.get("normalized_text") or sup.get("text") + + if not text: + continue + + # If already has IPA, keep it + if "ipa" in custom and isinstance(custom["ipa"], str) and custom["ipa"].strip(): + continue + + cached = cache.get(espeak.voice, text) + if cached is None: + cached = espeak.text_to_ipa(text) + cache.set(espeak.voice, text, cached) + + custom["ipa"] = cached + + return cut + + +def process_shard( + shard_path: Path, + out_shard_path: Path, + espeak: EspeakRunner, +) -> Tuple[Path, int]: + """ + Read shard jsonl.gz, add IPA, write out shard jsonl.gz + Returns: (out_shard_path, num_lines) + """ + cache = IPACache() + n = 0 + + with gzip.open(shard_path, "rt", encoding="utf-8") as fin, gzip.open( + out_shard_path, "wt", encoding="utf-8" + ) as fout: + for line in fin: + line = line.strip() + if not line: + continue + cut = load_json_line(line) + cut = add_ipa_to_cut(cut, espeak=espeak, cache=cache) + fout.write(dump_json_line(cut)) + fout.write("\n") + n += 1 + + return out_shard_path, n + + +def process_cuts_dir(lang: str, cuts_dir: Path) -> None: + voice = ESPEAK_VOICE_BY_LANG.get(lang, lang) + exe = _find_espeak_binary() + espeak = EspeakRunner(exe=exe, voice=voice) + + out_dir = derive_output_dir(cuts_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + shards = iter_shards(cuts_dir) + if not shards: + print(f"[WARN] No shards matched {SHARD_GLOB} in {cuts_dir}", file=sys.stderr) + return + + print(f"[INFO] {lang}: {cuts_dir} -> {out_dir} (shards={len(shards)})") + + jobs: List[Tuple[Path, Path]] = [] + for shard in shards: + out_shard = out_dir 
/ shard.name + if SKIP_EXISTING_OUTPUT_SHARDS and out_shard.exists(): + continue + jobs.append((shard, out_shard)) + + if not jobs: + print(f"[INFO] {lang}: nothing to do in {cuts_dir} (all outputs exist).") + return + + # Parallelize per shard + with cf.ProcessPoolExecutor(max_workers=MAX_WORKERS) as ex: + futures = [] + for shard, out_shard in jobs: + futures.append(ex.submit(_process_shard_worker, shard, out_shard, espeak.exe, espeak.voice)) + + for fut in cf.as_completed(futures): + out_shard_path, n = fut.result() + print(f"[OK] wrote {out_shard_path} (lines={n})") + + +def _process_shard_worker(shard: Path, out_shard: Path, exe: str, voice: str) -> Tuple[Path, int]: + # Re-create runner in worker process + espeak = EspeakRunner(exe=exe, voice=voice) + return process_shard(shard, out_shard, espeak) + + +def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: + """Return list of all available language codes.""" + return list(cuts_dirs.keys()) + + +def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: + """ + Process all directories for a given language. + Returns True if successful, False if there was an issue. + """ + if lang not in cuts_dirs: + print(f"[ERROR] Unknown language: {lang}", file=sys.stderr) + print(f"[ERROR] Available languages: {get_available_languages(cuts_dirs)}", file=sys.stderr) + return False + + dirs = cuts_dirs[lang] + for d in dirs: + cuts_dir = Path(d) + if not cuts_dir.exists(): + print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) + continue + process_cuts_dir(lang, cuts_dir) + + return True + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Add IPA strings to Lhotse cuts jsonl.gz shards." + ) + parser.add_argument( + "--lang", + type=str, + required=True, + help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages." + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + args = parser.parse_args() + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + print(f"[INFO] Loaded config with languages: {get_available_languages(cuts_dirs)}") + + if args.lang == "all": + # Process all languages + for lang in cuts_dirs.keys(): + print(f"\n{'='*60}") + print(f"[INFO] Processing language: {lang}") + print(f"{'='*60}") + process_language(lang, cuts_dirs) + else: + success = process_language(args.lang, cuts_dirs) + if not success: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py new file mode 100644 index 000000000000..7032e1eeca0f --- /dev/null +++ b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py @@ -0,0 +1,728 @@ +#!/usr/bin/env python3 +""" +Analyze and compare tokenization (tokens per second of audio) between: +1. Qwen/Qwen2.5-1.5B-Instruct tokenizer on raw text +2. NVIDIA Nemotron Nano 30B tokenizer on raw text +3. IPABPETokenizer on phonemized IPA text at different vocab sizes + +This script: +1. Creates a balanced IPA corpus (equal samples per language) from train_langs +2. Trains IPA BPE tokenizers at vocab sizes 512, 1024, 2048, 4096 +3. For each test language, samples text pairs from cuts_with_ipa directories +4. Computes tokens per second (tokens / audio duration) for each tokenizer +5. 
Outputs comparison statistics showing tokens/second for each tokenizer + +Features: +- Reads data once and reuses across all vocab sizes (efficient) +- Balances training data across languages (uses min count across all train langs) +- Supports separate train and test language sets +- Computes tokens per second using audio duration from cuts + +Usage: + # Train and test on all languages + python analyze_ipa_tokenization.py --output_dir /path/to/output + + # Train on en,de,fr but test on all languages + python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs en,de,fr --test_langs all + + # Train on all, test on specific languages + python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs all --test_langs en,zh + + # Cap training samples per language + python analyze_ipa_tokenization.py --output_dir /path/to/output --max_samples_per_lang 50000 +""" + +import argparse +import gzip +import json +import os +import random +import sys +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Generator, List, Optional, Tuple + +import numpy as np +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.pre_tokenizers import ByteLevel +from tokenizers.trainers import BpeTrainer +from transformers import AutoTokenizer + +# ------------------------- +# CONFIGURATION +# ------------------------- + +VOCAB_SIZES = [512, 1024, 2048, 4096] + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return 
json.load(f) + +OUTPUT_SUFFIX = "_with_ipa" +SHARD_GLOB = "cuts.*.jsonl.gz" + + +@dataclass +class TextPair: + """A pair of raw text and its IPA phonemization with audio duration.""" + raw_text: str + ipa_text: str + lang: str + duration: float # audio duration in seconds + + +@dataclass +class TokenizationStats: + """Statistics for tokenization comparison (tokens per second).""" + lang: str + num_samples: int + total_duration: float # sum of all durations in seconds + qwen_tokens_per_second: float + nemotron_tokens_per_second: float + ipa_tokens_per_second: Dict[int, float] # vocab_size -> tokens/sec + + +def get_ipa_dir(cuts_dir: Path) -> Path: + """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" + name = cuts_dir.name + if name == "cuts": + out_name = f"cuts{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def iter_shards(ipa_dir: Path) -> List[Path]: + """Get all shard files in a directory.""" + return sorted(ipa_dir.glob(SHARD_GLOB)) + + +def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: + """ + Extract text pairs (raw text + IPA) from a single shard file. 
+ + Yields: + TextPair objects with raw_text, ipa_text, and duration + """ + with gzip.open(shard_path, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + # Get duration from the top-level cut object + duration = cut.get("duration", 0.0) + supervisions = cut.get("supervisions", []) + for sup in supervisions: + custom = sup.get("custom", {}) + ipa = custom.get("ipa") + # Get raw text - prefer normalized_text, fallback to text + raw_text = custom.get("normalized_text") or sup.get("text") + + if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): + ipa = ipa.strip() + raw_text = raw_text.strip() + if ipa and raw_text and duration > 0: + yield TextPair(raw_text=raw_text, ipa_text=ipa, lang=lang, duration=duration) + except json.JSONDecodeError: + continue + + +def sample_text_pairs( + lang: str, + cuts_dirs: Dict[str, List[str]], + num_samples: int = 1000, + seed: int = 42, +) -> List[TextPair]: + """ + Sample text pairs from a language's cuts_with_ipa directories. 
+ + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + num_samples: Number of samples to collect + seed: Random seed for reproducibility + + Returns: + List of TextPair objects + """ + random.seed(seed) + + if lang not in cuts_dirs: + raise ValueError(f"Unknown language: {lang}") + + # Collect all text pairs from all directories + all_pairs = [] + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) + continue + + shards = iter_shards(ipa_dir) + for shard in shards: + for pair in extract_text_pairs_from_shard(shard, lang): + all_pairs.append(pair) + # Early exit if we have way more than needed + if len(all_pairs) >= num_samples * 10: + break + if len(all_pairs) >= num_samples * 10: + break + if len(all_pairs) >= num_samples * 10: + break + + # Sample + if len(all_pairs) <= num_samples: + print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") + return all_pairs + + return random.sample(all_pairs, num_samples) + + +def iter_ipa_strings_for_lang( + lang: str, + cuts_dirs: Dict[str, List[str]], +) -> Generator[str, None, None]: + """Iterate over all IPA strings for a single language (memory-efficient).""" + if lang not in cuts_dirs: + return + + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + continue + + shards = iter_shards(ipa_dir) + for shard in shards: + with gzip.open(shard, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + for sup in cut.get("supervisions", []): + ipa = sup.get("custom", {}).get("ipa") + if ipa and isinstance(ipa, str) and ipa.strip(): + yield ipa.strip() + except json.JSONDecodeError: + continue + + +def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, 
List[str]], max_count: int = 100000) -> int: + """Count IPA strings for a language without loading into memory.""" + count = 0 + for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): + count += 1 + if count >= max_count: + break + return count + + +def simple_sample_ipa_strings( + lang: str, + cuts_dirs: Dict[str, List[str]], + k: int, + max_collect: int = 100000, + seed: int = 42, +) -> List[str]: + """ + Simple sampling: collect up to max_collect IPA strings, then randomly sample k. + + This avoids reading through all data like reservoir sampling does. + + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + k: Number of samples to select + max_collect: Maximum number of strings to collect before sampling + seed: Random seed for reproducibility + + Returns: + List of up to k sampled IPA strings + """ + rng = random.Random(seed) + collected: List[str] = [] + + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): + collected.append(ipa) + if len(collected) >= max_collect: + break + + # If we have fewer than k, return all + if len(collected) <= k: + return collected + + # Otherwise, randomly sample k + return rng.sample(collected, k) + + +def create_balanced_corpus( + train_langs: List[str], + cuts_dirs: Dict[str, List[str]], + output_file: str, + max_samples_per_lang: Optional[int] = None, + max_count_per_lang: int = 100000, + seed: int = 42, +) -> Tuple[str, Dict[str, int]]: + """ + Create a balanced IPA corpus file with equal samples from each language. + + Uses a memory-efficient two-pass approach: + 1. First pass: Count sentences per language (up to max_count_per_lang) + 2. 
Second pass: Use simple sampling to select samples + + Args: + train_langs: List of language codes to include + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + output_file: Path to write the balanced corpus + max_samples_per_lang: Optional cap on samples per language + max_count_per_lang: Max count per language when counting IPA strings + seed: Random seed for reproducibility + + Returns: + Tuple of (corpus_file_path, dict of lang -> actual_count) + """ + # First pass: Count sentences per language + print("[INFO] Pass 1: Counting IPA strings per language...") + lang_counts: Dict[str, int] = {} + + for lang in train_langs: + if lang not in cuts_dirs: + print(f"[WARN] Language {lang} not in config, skipping") + continue + print(f"[INFO] Counting {lang}...", end=" ", flush=True) + count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) + lang_counts[lang] = count + print(f"{count} IPA strings") + + if not lang_counts: + raise ValueError("No IPA strings found for any language") + + # Find minimum count across languages + min_count = min(lang_counts.values()) + print(f"[INFO] Minimum count across languages: {min_count}") + + # Apply max_samples_per_lang cap if specified + samples_per_lang = min_count + if max_samples_per_lang is not None and max_samples_per_lang < min_count: + samples_per_lang = max_samples_per_lang + print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") + + # Second pass: Sample from each language using simple sampling + print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") + actual_counts: Dict[str, int] = {} + total_written = 0 + + with open(output_file, "w", encoding="utf-8") as f: + for lang in lang_counts.keys(): + print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) + # Per-language seed for variety. NOTE(review): hash(str) is salted per process (PYTHONHASHSEED), so this is NOT reproducible across runs — use e.g. zlib.crc32(lang.encode()) % 10000 for a stable offset + lang_seed = seed + hash(lang) % 10000 + sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang,
max_count_per_lang, lang_seed) + + for ipa in sampled: + f.write(ipa + "\n") + total_written += 1 + + actual_counts[lang] = len(sampled) + print(f"sampled {len(sampled)} strings") + + print(f"[INFO] Total IPA strings written to corpus: {total_written}") + print(f"[INFO] Balanced corpus saved to: {output_file}") + + return output_file, actual_counts + + +def train_ipa_bpe_tokenizer( + output_dir: str, + vocab_size: int, + corpus_file: str, + min_frequency: int = 2, +) -> Tokenizer: + """ + Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. + + Args: + output_dir: Directory to save tokenizer files + vocab_size: Target vocabulary size + corpus_file: Path to the IPA corpus file (one IPA string per line) + min_frequency: Minimum frequency for a token to be included + + Returns: + Trained Tokenizer object + """ + tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") + os.makedirs(tokenizer_dir, exist_ok=True) + + tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") + + # Check if already trained + if os.path.exists(tokenizer_file): + print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") + return Tokenizer.from_file(tokenizer_file) + + # Initialize tokenizer + tokenizer = Tokenizer(BPE(unk_token="<unk>")) + tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) + + special_tokens = ["<unk>", "<s>", "</s>"]  # NOTE(review): angle-bracket tokens appear stripped by extraction; confirm exact tokens against the original source + + trainer = BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + show_progress=True, + ) + + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") + tokenizer.train(files=[corpus_file], trainer=trainer) + + # Save + tokenizer.save(tokenizer_file) + tokenizer.model.save(tokenizer_dir) + + print(f"[INFO] Saved tokenizer to {tokenizer_dir}") + + return tokenizer + + +def compute_stats( + text_pairs: List[TextPair], + qwen_tokenizer: AutoTokenizer, + nemotron_tokenizer: AutoTokenizer, + ipa_tokenizers: Dict[int, Tokenizer], + lang: str, +) ->
TokenizationStats: + """ + Compute tokenization statistics (tokens per second) for a set of text pairs. + """ + qwen_counts = [] + nemotron_counts = [] + ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} + + for pair in text_pairs: + # Qwen tokenizer on raw text + qwen_tokens = qwen_tokenizer.encode(pair.raw_text) + qwen_counts.append(len(qwen_tokens)) + + # Nemotron tokenizer on raw text + nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) + nemotron_counts.append(len(nemotron_tokens)) + + # IPA tokenizers on IPA text + for vocab_size, tokenizer in ipa_tokenizers.items(): + ipa_tokens = tokenizer.encode(pair.ipa_text) + ipa_counts[vocab_size].append(len(ipa_tokens.ids)) + + # Calculate total duration and token counts + total_duration = sum(pair.duration for pair in text_pairs) + qwen_total = sum(qwen_counts) + nemotron_total = sum(nemotron_counts) + + # Compute tokens per second + qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 + nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 + + ipa_tps = {} + for vocab_size in ipa_tokenizers.keys(): + ipa_total = sum(ipa_counts[vocab_size]) + ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 + + return TokenizationStats( + lang=lang, + num_samples=len(text_pairs), + total_duration=total_duration, + qwen_tokens_per_second=qwen_tps, + nemotron_tokens_per_second=nemotron_tps, + ipa_tokens_per_second=ipa_tps, + ) + + +def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int]): + """Print a formatted table of tokens per second statistics.""" + print("\n" + "=" * 120) + print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") + print("=" * 120) + + # Header + header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" + for vs in vocab_sizes: + header += f" {'IPA-' + str(vs):>10}" + print(header) + print("-" * 120) + + # Data rows + 
for stats in all_stats: + row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" + for vs in vocab_sizes: + row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" + print(row) + + # Aggregated stats + print("-" * 120) + total_samples = sum(s.num_samples for s in all_stats) + total_duration = sum(s.total_duration for s in all_stats) + + # Compute overall tokens per second (weighted by duration) + total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) + total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) + overall_qwen_tps = total_qwen_tokens / total_duration if total_duration > 0 else 0 + overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 + + agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" + for vs in vocab_sizes: + total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) + overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 + agg_row += f" {overall_ipa_tps:>10.2f}" + print(agg_row) + print("=" * 120) + + # Summary + print("\nSUMMARY:") + print(f" - Total samples analyzed: {total_samples}") + print(f" - Total audio duration: {total_duration:.2f} seconds ({total_duration/3600:.2f} hours)") + print(f" - Qwen tokens/second: {overall_qwen_tps:.2f}") + print(f" - Nemotron tokens/second: {overall_nemotron_tps:.2f}") + for vs in vocab_sizes: + total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) + overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 + print(f" - IPA-{vs} tokens/second: {overall_ipa_tps:.2f}") + print() + + +def save_results_json( + all_stats: List[TokenizationStats], + output_path: str, + train_langs: Optional[List[str]] = None, + test_langs: 
Optional[List[str]] = None, +): + """Save results to JSON file with metadata.""" + output = { + "metadata": { + "train_langs": train_langs or [], + "test_langs": test_langs or [], + }, + "results": [], + } + + for stats in all_stats: + output["results"].append({ + "lang": stats.lang, + "num_samples": stats.num_samples, + "total_duration_seconds": stats.total_duration, + "qwen_tokens_per_second": stats.qwen_tokens_per_second, + "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, + "ipa_tokens_per_second": { + str(vs): stats.ipa_tokens_per_second[vs] + for vs in stats.ipa_tokens_per_second.keys() + } + }) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(output, f, indent=2) + print(f"[INFO] Saved results to {output_path}") + + +def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: + """Parse a language argument (comma-separated or 'all').""" + if arg == "all": + return available_langs + langs = [l.strip() for l in arg.split(",") if l.strip()] + # Validate languages + for lang in langs: + if lang not in available_langs: + raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") + return langs + + +def main(): + parser = argparse.ArgumentParser( + description="Compare tokenization between Qwen and IPA BPE tokenizers." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Directory to save tokenizers and results", + ) + parser.add_argument( + "--samples_per_lang", + type=int, + default=1000, + help="Number of samples per language for testing (default: 1000)", + ) + parser.add_argument( + "--train_langs", + type=str, + default="all", + help="Comma-separated languages for training tokenizer, or 'all' (default: all)", + ) + parser.add_argument( + "--test_langs", + type=str, + default="all", + help="Comma-separated languages for testing/analysis, or 'all' (default: all)", + ) + parser.add_argument( + "--max_samples_per_lang", + type=int, + default=None, + help="Optional cap on training samples per language (default: use min count across langs)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for sampling (default: 42)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + parser.add_argument( + "--max_count_per_lang", + type=int, + default=100000, + help="Max count per language when counting IPA strings (default: 100000)", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + available_langs = list(cuts_dirs.keys()) + print(f"[INFO] Loaded config with languages: {available_langs}") + + # Parse train and test languages + try: + train_langs = parse_lang_arg(args.train_langs, available_langs) + test_langs = parse_lang_arg(args.test_langs, available_langs) + except ValueError as e: + print(f"[ERROR] {e}") + sys.exit(1) + + print(f"[INFO] Training languages: {train_langs}") + print(f"[INFO] Testing languages: {test_langs}") + print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") + print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") + print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") + + # Step 1: Create balanced IPA corpus once + print("\n" + "=" * 60) + print("STEP 1: Creating balanced IPA corpus") + print("=" * 60) + + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") + + # Check if corpus already exists + if os.path.exists(corpus_file): + print(f"[INFO] Using existing corpus file: {corpus_file}") + with open(corpus_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {line_count} IPA strings") + else: + corpus_file, lang_counts = create_balanced_corpus( + train_langs=train_langs, + cuts_dirs=cuts_dirs, + output_file=corpus_file, + max_samples_per_lang=args.max_samples_per_lang, + max_count_per_lang=args.max_count_per_lang, + seed=args.seed, + ) + + # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) + print("\n" + "=" * 60) + print("STEP 2: Training IPA BPE tokenizers") + 
print("=" * 60) + + ipa_tokenizers = {} + for vocab_size in VOCAB_SIZES: + print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") + ipa_tokenizers[vocab_size] = train_ipa_bpe_tokenizer( + output_dir=args.output_dir, + vocab_size=vocab_size, + corpus_file=corpus_file, + min_frequency=2, + ) + + # Step 3: Load Qwen and Nemotron tokenizers + print("\n" + "=" * 60) + print("STEP 3: Loading Qwen and Nemotron tokenizers") + print("=" * 60) + + print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") + qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") + + print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") + + nemotron_tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True) + + print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") + + # Step 4: Sample text pairs and compute statistics (on test languages) + print("\n" + "=" * 60) + print("STEP 4: Sampling and analyzing (test languages)") + print("=" * 60) + + all_stats = [] + for lang in test_langs: + print(f"\n[INFO] Processing language: {lang}") + + # Sample text pairs + text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) + + if not text_pairs: + print(f"[WARN] No text pairs found for {lang}, skipping") + continue + + print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") + + # Compute stats + stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) + all_stats.append(stats) + + # Print intermediate results + print(f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s") + for vs in VOCAB_SIZES: + print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") + + # Step 5: Print and save results + print("\n" + "=" * 60) + 
print("STEP 5: Results") + print("=" * 60) + + print_stats_table(all_stats, VOCAB_SIZES) + + # Save to JSON with metadata + results_path = os.path.join(args.output_dir, "tokenization_comparison.json") + save_results_json(all_stats, results_path, train_langs, test_langs) + + print("[INFO] Done!") + + +if __name__ == "__main__": + main() diff --git a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json new file mode 100644 index 000000000000..8785de53211e --- /dev/null +++ b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json @@ -0,0 +1,45 @@ +{ + "de": ["/Data/tts_lhotse_datasets/speech_data/de/cmltts_de_train/cuts"], + "es": [ + "/Data/tts_lhotse_datasets/speech_data/es/cmltts_es_train/cuts", + "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts", + "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts_textContext" + ], + "fr": [ + "/Data/tts_lhotse_datasets/speech_data/fr/cmltts_fr_train/cuts", + "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts", + "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts_textContext" + ], + "hi": [ + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_2/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_2/cuts" + ], + "it": ["/Data/tts_lhotse_datasets/speech_data/it/cmltts_it_train/cuts"], + "vi": [ + "/Data/tts_lhotse_datasets/speech_data/vi/Infore1_2_lsvsc/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/vi/NorthFemale/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/nvyt_vi/nvyt_yt2025/cuts" + ], + "zh": [ + "/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts", + 
"/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_2/cuts" + ], + "en": [ + "/Data/tts_lhotse_datasets/speech_data/en/nvyt2505/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/hifitts/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/hifitts2/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/libritts/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts_textContext" + ] +} diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py new file mode 100644 index 000000000000..c6098d93839a --- /dev/null +++ b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +""" +Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards. + +This script: +1. Reads IPA strings from cuts_with_ipa directories (output of add_ipa_to_lhotse_shards.py) +2. Optionally balances data across languages (samples equal amounts from each) +3. Trains a HuggingFace ByteLevelBPETokenizer on all extracted IPA strings +4. 
Saves vocab.json and merges.txt to the specified output directory + +Features: +- Language balancing: uses the same number of samples from each language +- Configurable max samples per language + +Usage: + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --vocab_size 1024 + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs en,de --vocab_size 2048 + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs all --max_samples_per_lang 50000 + +The trained tokenizer can be loaded using the IPABPETokenizer class in: + nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +""" + +from __future__ import annotations + +import argparse +import gzip +import json +import os +import random +import sys +from pathlib import Path +from typing import Dict, Generator, List, Optional, Tuple + +from tokenizers import Tokenizer +from tokenizers.decoders import ByteLevel as ByteLevelDecoder +from tokenizers.models import BPE +from tokenizers.pre_tokenizers import ByteLevel +from tokenizers.trainers import BpeTrainer + +# ------------------------- +# USER CONFIG - Same structure as add_ipa_to_lhotse_shards.py +# ------------------------- + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + +OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa +SHARD_GLOB = "cuts.*.jsonl.gz" + + +def get_ipa_dir(cuts_dir: Path) -> Path: + """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" + name = cuts_dir.name + if name == "cuts": + out_name = 
f"cuts{OUTPUT_SUFFIX}" + elif name.endswith("_textContext"): + # Handle cuts_textContext -> cuts_textContext_with_ipa + out_name = f"{name}{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def iter_shards(ipa_dir: Path) -> List[Path]: + """Get all shard files in a directory.""" + return sorted(ipa_dir.glob(SHARD_GLOB)) + + +def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: + """ + Extract all IPA strings from a single shard file. + + Yields: + IPA strings from cut["supervisions"][i]["custom"]["ipa"] + """ + with gzip.open(shard_path, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + supervisions = cut.get("supervisions", []) + for sup in supervisions: + custom = sup.get("custom", {}) + ipa = custom.get("ipa") + if ipa and isinstance(ipa, str) and ipa.strip(): + yield ipa.strip() + except json.JSONDecodeError: + continue + + +def extract_ipa_from_dir(ipa_dir: Path) -> Generator[str, None, None]: + """Extract all IPA strings from all shards in a directory.""" + shards = iter_shards(ipa_dir) + for shard in shards: + yield from extract_ipa_from_shard(shard) + + +def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: + """Return list of all available language codes.""" + return list(cuts_dirs.keys()) + + +def collect_ipa_strings( + cuts_dirs: Dict[str, List[str]], + lang: Optional[str] = None, +) -> Generator[str, None, None]: + """ + Collect all IPA strings from the specified language(s). + + Args: + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + lang: Language code or None for all languages. + + Yields: + IPA strings + """ + if lang is None or lang == "all": + langs_to_process = list(cuts_dirs.keys()) + else: + if lang not in cuts_dirs: + raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") + langs_to_process = [lang] + + for lang_code in langs_to_process: + print(f"[INFO] Processing language: {lang_code}") + for cuts_dir_str in cuts_dirs[lang_code]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) + continue + + print(f"[INFO] Reading from: {ipa_dir}") + count = 0 + for ipa in extract_ipa_from_dir(ipa_dir): + yield ipa + count += 1 + print(f"[INFO] Extracted {count} IPA strings from {ipa_dir}") + + +def iter_ipa_strings_for_lang( + lang: str, + cuts_dirs: Dict[str, List[str]], +) -> Generator[str, None, None]: + """Iterate over all IPA strings for a single language (memory-efficient).""" + if lang not in cuts_dirs: + return + + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + continue + + for ipa in extract_ipa_from_dir(ipa_dir): + yield ipa + + +def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, List[str]], max_count: int = 100000) -> int: + """Count IPA strings for a language without loading into memory.""" + count = 0 + for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): + count += 1 + if count >= max_count: + break + return count + + +def simple_sample_ipa_strings( + lang: str, + cuts_dirs: Dict[str, List[str]], + k: int, + max_collect: int = 100000, + seed: int = 42, +) -> List[str]: + """ + Simple sampling: collect up to max_collect IPA strings, then randomly sample k. + + This avoids reading through all data like reservoir sampling does. 
+ + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + k: Number of samples to select + max_collect: Maximum number of strings to collect before sampling + seed: Random seed for reproducibility + + Returns: + List of up to k sampled IPA strings + """ + rng = random.Random(seed) + collected: List[str] = [] + + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): + collected.append(ipa) + if len(collected) >= max_collect: + break + + # If we have fewer than k, return all + if len(collected) <= k: + return collected + + # Otherwise, randomly sample k + return rng.sample(collected, k) + + +def parse_langs_arg(arg: str, available_langs: List[str]) -> List[str]: + """Parse a language argument (comma-separated or 'all').""" + if arg == "all": + return available_langs + langs = [l.strip() for l in arg.split(",") if l.strip()] + for lang in langs: + if lang not in available_langs: + raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") + return langs + + +def create_balanced_corpus( + train_langs: List[str], + cuts_dirs: Dict[str, List[str]], + output_file: str, + max_samples_per_lang: Optional[int] = None, + max_count_per_lang: int = 100000, + seed: int = 42, +) -> Tuple[str, Dict[str, int]]: + """ + Create a balanced IPA corpus file with equal samples from each language. + + Uses a memory-efficient two-pass approach: + 1. First pass: Count sentences per language (up to max_count_per_lang) + 2. 
Second pass: Use simple sampling to select samples + + Args: + train_langs: List of language codes to include + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + output_file: Path to write the balanced corpus + max_samples_per_lang: Optional cap on samples per language + max_count_per_lang: Max count per language when counting IPA strings + seed: Random seed for reproducibility + + Returns: + Tuple of (corpus_file_path, dict of lang -> actual_count) + """ + # First pass: Count sentences per language + print("[INFO] Pass 1: Counting IPA strings per language...") + lang_counts: Dict[str, int] = {} + + for lang in train_langs: + if lang not in cuts_dirs: + print(f"[WARN] Language {lang} not in config, skipping") + continue + print(f"[INFO] Counting {lang}...", end=" ", flush=True) + count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) + lang_counts[lang] = count + print(f"{count} IPA strings") + + if not lang_counts: + raise ValueError("No IPA strings found for any language") + + # Find minimum count across languages + min_count = min(lang_counts.values()) + print(f"[INFO] Minimum count across languages: {min_count}") + + # Apply max_samples_per_lang cap if specified + samples_per_lang = min_count + if max_samples_per_lang is not None and max_samples_per_lang < min_count: + samples_per_lang = max_samples_per_lang + print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") + + # Second pass: Sample from each language using simple sampling + print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") + actual_counts: Dict[str, int] = {} + total_written = 0 + + with open(output_file, "w", encoding="utf-8") as f: + for lang in lang_counts.keys(): + print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) + # Use different seed per language for variety, but reproducible + lang_seed = seed + hash(lang) % 10000 + sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) + + for ipa in sampled: + f.write(ipa + "\n") + total_written += 1 + + actual_counts[lang] = len(sampled) + print(f"sampled {len(sampled)} strings") + + print(f"[INFO] Total IPA strings written to corpus: {total_written}") + print(f"[INFO] Balanced corpus saved to: {output_file}") + + return output_file, actual_counts + + +def train_bpe_tokenizer( + corpus_file: str, + vocab_size: int = 1024, + min_frequency: int = 2, + output_dir: str = "./ipa_bpe_tokenizer", +) -> Tokenizer: + """ + Train a byte-level BPE tokenizer on IPA strings from a corpus file. + + Args: + corpus_file: Path to the IPA corpus file (one IPA string per line) + vocab_size: Target vocabulary size + min_frequency: Minimum frequency for a token to be included + output_dir: Directory to save the tokenizer files + + Returns: + Trained Tokenizer object + """ + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Check if tokenizer already exists + tokenizer_path = os.path.join(output_dir, "tokenizer.json") + if os.path.exists(tokenizer_path): + print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") + return Tokenizer.from_file(tokenizer_path) + + # Count lines in corpus + with open(corpus_file, "r", encoding="utf-8") as f: + total_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {total_count} IPA strings") + + if total_count == 0: + raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") + + # Initialize a byte-level BPE tokenizer + tokenizer = Tokenizer(BPE(unk_token="")) + + # Use byte-level pre-tokenization (like GPT-2) + tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) + + # Add byte-level decoder to properly convert back to original text + tokenizer.decoder = ByteLevelDecoder() + + # Define special tokens + special_tokens = ["", "", ""] + + # Create trainer + trainer = BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + show_progress=True, + ) + + # Train the tokenizer + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") + tokenizer.train(files=[corpus_file], trainer=trainer) + + # Save the tokenizer + vocab_path = os.path.join(output_dir, "vocab.json") + merges_path = os.path.join(output_dir, "merges.txt") + + # Save using the tokenizer's model save method + tokenizer.model.save(output_dir) + + # Also save the full tokenizer for easy loading + tokenizer.save(tokenizer_path) + + print(f"[INFO] Tokenizer saved to: {output_dir}") + print(f"[INFO] - vocab.json: {vocab_path}") + print(f"[INFO] - merges.txt: {merges_path}") + print(f"[INFO] - tokenizer.json: {tokenizer_path}") + print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") + + return tokenizer + + +def main(): + parser = argparse.ArgumentParser( + description="Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Directory to save the trained tokenizer files (vocab.json, merges.txt, tokenizer.json)", + ) + parser.add_argument( + "--vocab_size", + type=int, + default=1024, + help="Vocabulary size for the BPE tokenizer (default: 1024)", + ) + parser.add_argument( + "--min_frequency", + type=int, + default=2, + help="Minimum frequency for a token to be included in vocabulary (default: 2)", + ) + parser.add_argument( + "--train_langs", + type=str, + default="all", + help="Comma-separated language codes for training, or 'all' (default: all)", + ) + parser.add_argument( + "--max_samples_per_lang", + type=int, + default=None, + help="Optional cap on samples per language (default: use min count across langs for balance)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for sampling (default: 42)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + parser.add_argument( + "--max_count_per_lang", + type=int, + default=100000, + help="Max count per language when counting IPA strings (default: 100000)", + ) + args = parser.parse_args() + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + available_langs = get_available_languages(cuts_dirs) + + # Parse train_langs + try: + train_langs = parse_langs_arg(args.train_langs, available_langs) + except ValueError as e: + print(f"[ERROR] {e}") + sys.exit(1) + + print(f"[INFO] Training IPA BPE tokenizer") + print(f"[INFO] Output directory: {args.output_dir}") + print(f"[INFO] Vocabulary size: {args.vocab_size}") + print(f"[INFO] Min frequency: {args.min_frequency}") + print(f"[INFO] Training languages: {train_langs}") + print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") + print(f"[INFO] Max count per lang: {args.max_count_per_lang}") + print(f"[INFO] Available languages: {available_langs}") + + os.makedirs(args.output_dir, exist_ok=True) + + # Step 1: Create balanced corpus + print("\n" + "=" * 60) + print("STEP 1: Creating balanced IPA corpus") + print("=" * 60) + + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") + + if os.path.exists(corpus_file): + print(f"[INFO] Using existing corpus file: {corpus_file}") + with open(corpus_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {line_count} IPA strings") + else: + corpus_file, lang_counts = create_balanced_corpus( + train_langs=train_langs, + cuts_dirs=cuts_dirs, + output_file=corpus_file, + max_samples_per_lang=args.max_samples_per_lang, + max_count_per_lang=args.max_count_per_lang, + seed=args.seed, + ) + + # Step 2: Train tokenizer + print("\n" + "=" * 60) + print("STEP 2: Training BPE tokenizer") + print("=" * 60) + + tokenizer = train_bpe_tokenizer( + corpus_file=corpus_file, + 
vocab_size=args.vocab_size, + min_frequency=args.min_frequency, + output_dir=args.output_dir, + ) + + # Test the tokenizer + print("\n[INFO] Testing tokenizer with sample IPA strings:") + test_strings = [ + "həˈloʊ wɜːld", # hello world + "ˈaɪ pʰiː eɪ", # IPA + "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three + ] + for test_str in test_strings: + encoded = tokenizer.encode(test_str) + decoded = tokenizer.decode(encoded.ids) + print(f" Input: '{test_str}'") + print(f" Tokens: {encoded.tokens}") + print(f" IDs: {encoded.ids}") + print(f" Decoded: '{decoded}'") + print() + + print("[INFO] Done!") + + +if __name__ == "__main__": + main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json new file mode 100644 index 000000000000..6d7e35116405 --- /dev/null +++ b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json @@ -0,0 +1,9954 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + 
"": 0, + "": 1, + "": 2, + "(": 3, + ")": 4, + "-": 5, + ".": 6, + "1": 7, + "2": 8, + "4": 9, + "5": 10, + "6": 11, + "7": 12, + "F": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "h": 20, + "i": 21, + "j": 22, + "k": 23, + "l": 24, + "m": 25, + "n": 26, + "o": 27, + "p": 28, + "q": 29, + "r": 30, + "s": 31, + "t": 32, + "u": 33, + "v": 34, + "w": 35, + "x": 36, + "y": 37, + "z": 38, + "¡": 39, + "£": 40, + "¦": 41, + "§": 42, + "©": 43, + "ª": 44, + "¬": 45, + "°": 46, + "²": 47, + "³": 48, + "¸": 49, + "¹": 50, + "¾": 51, + "Ã": 52, + "Å": 53, + "É": 54, + "Ê": 55, + "Ë": 56, + "Ì": 57, + "Î": 58, + "Ï": 59, + "Ċ": 60, + "Ġ": 61, + "Ģ": 62, + "ģ": 63, + "Ĥ": 64, + "ĥ": 65, + "ĩ": 66, + "Ī": 67, + "Ĭ": 68, + "ĭ": 69, + "Į": 70, + "į": 71, + "İ": 72, + "ı": 73, + "IJ": 74, + "ij": 75, + "Ĵ": 76, + "ĵ": 77, + "Ķ": 78, + "ķ": 79, + "ĸ": 80, + "Ĺ": 81, + "Ļ": 82, + "Ľ": 83, + "ľ": 84, + "Ŀ": 85, + "Ł": 86, + "ËĪ": 87, + "ËIJ": 88, + "ËĪÉ": 89, + "ËĮ": 90, + "ÉĻ": 91, + "ËĪa": 92, + "ËĪi": 93, + "Ġt": 94, + "ɪ": 95, + "ɾ": 96, + "ĠÉ": 97, + "Ġk": 98, + "Éľ": 99, + "Ġs": 100, + "ËĪe": 101, + "ÉĽ": 102, + "ËĪo": 103, + "Ġl": 104, + "ËĪÉĽ": 105, + "Ġd": 106, + "ÊĬ": 107, + "ËĪaËIJ": 108, + "Ġp": 109, + "Ìĥ": 110, + "Ġm": 111, + "ËĪu": 112, + "Åĭ": 113, + "ð": 114, + "ËĪÉĶ": 115, + "ÊĮ": 116, + "ËĮa": 117, + "Ġh": 118, + "ËĪÊĮ": 119, + "Ġn": 120, + "Êģ": 121, + "ËĪÉij": 122, + "Êĥ": 123, + "eËIJ": 124, + "Ġa": 125, + "Ġb": 126, + "ÉĶ": 127, + "ËĪÉĻ": 128, + "ÉĻn": 129, + "Ġf": 130, + "ËĪɪ": 131, + "É¡": 132, + "ËĪeËIJ": 133, + "Ġj": 134, + "nt": 135, + "Ġð": 136, + "ĠËĮ": 137, + "Ġts": 138, + "ĠÉ¡": 139, + "Éķ": 140, + "ËĪoËIJ": 141, + "ʰ": 142, + "aËIJ": 143, + "ËĪy": 144, + "ĠtÉķ": 145, + "ËĪiËIJ": 146, + "ĠÊ": 147, + "Ġv": 148, + "Ġw": 149, + "st": 150, + "Éij": 151, + "nd": 152, + "ËĮi": 153, + "̪": 154, + "ËĮe": 155, + "Ġz": 156, + "ËĪaɪ": 157, + "ËĪiÉĽ": 158, + "β": 159, + "ɹ": 160, + "ĠËĮa": 161, + "θ": 162, + "ĠhÉĽ": 163, + "ÊĪ": 164, + 
"iËIJ": 165, + "ËĮo": 166, + "Ġɪ": 167, + "Éľn": 168, + "Ġx": 169, + "ĠtÉĻ": 170, + "ËĪuËIJ": 171, + "ËĮÉĻ": 172, + "ĠjËĪi": 173, + "ËĮÉĽ": 174, + "ĠÉĽ": 175, + "ĠËĪa": 176, + "ËĮaËIJ": 177, + "Ġla": 178, + "Ġðe": 179, + "ĠhÉĽËIJ": 180, + "Ġe": 181, + "ç": 182, + "ÉĻl": 183, + "oËIJ": 184, + "ËĪÉiju": 185, + "ÊĴ": 186, + "uËIJ": 187, + "ĠÉĹ": 188, + "ĠÉķ": 189, + "ËĮeËIJ": 190, + "ĠtÉķËĪi": 191, + "os": 192, + "ËĪÉĶËIJ": 193, + "as": 194, + "ËĪÊĬ": 195, + "Ġi": 196, + "ËĪai": 197, + "ɲ": 198, + "ɪn": 199, + "ts": 200, + "ÉľÅĭ": 201, + "ĠÉŁ": 202, + "ĠÊĥ": 203, + "ËĪeɪ": 204, + "ÉĽÉ¾": 205, + "ËĪÉĽËIJ": 206, + "ËĪÉĽÉ¾": 207, + "Ġr": 208, + "tÊĥ": 209, + "ËĮÉĶ": 210, + "ĠdÉĻ": 211, + "tÉĻ": 212, + "ou": 213, + "ËĪyÉĻ": 214, + "ĠËĮi": 215, + "ÉĻɾ": 216, + "ËĪÉĻÊĬ": 217, + "ËĪÊĮɾ": 218, + "ËĪÉĴ": 219, + "Ġth": 220, + "ËĪon": 221, + "Êĭ": 222, + "ËĪÉijËIJ": 223, + "ËĪÊĮh": 224, + "wËĪa": 225, + "ËĪei": 226, + "ll": 227, + "ĠÉIJ": 228, + "ÉijËIJ": 229, + "an": 230, + "ÉŁ": 231, + "ĠÊĭ": 232, + "Ġko": 233, + "kh": 234, + "ɪÅĭ": 235, + "ËĪaËIJɪ": 236, + "ĠtÊĥ": 237, + "ËĪaËIJt": 238, + "ĠËĮe": 239, + "ĠtÉķh": 240, + "ËĪuo": 241, + "ËĪonÉ¡": 242, + "Éĸ": 243, + "at": 244, + "Ġke": 245, + "ÉĴ": 246, + "ĠÉķËĪi": 247, + "ø": 248, + "ĠÉij": 249, + "ËĪeËIJk": 250, + "Åĵ": 251, + "re": 252, + "Ġɾ": 253, + "ĠkÉĶ": 254, + "ËĮÊĬ": 255, + "sk": 256, + "ĠÊĬ": 257, + "Ġand": 258, + "ɪç": 259, + "Ġme": 260, + "ËĪaɾ": 261, + "ĠËĪɪ": 262, + "na": 263, + "Ġβ": 264, + "ĠlËĪi": 265, + "jaËIJ": 266, + "li": 267, + "no": 268, + "Ġɪn": 269, + "ĠdËĮi": 270, + "Ġɲ": 271, + "tËIJ": 272, + "ÉĻm": 273, + "ĠlÉĻ": 274, + "ĠðÉĻ": 275, + "ɪk": 276, + "ËĪÉĽl": 277, + "Éľt": 278, + "Ġse": 279, + "es": 280, + "ËĪou": 281, + "ËĪaÊĬ": 282, + "ĠÉĶ": 283, + "ɪt": 284, + "ĠÅĭ": 285, + "ËĪÉĽn": 286, + "Êİ": 287, + "Ġkh": 288, + "ËĪÉĽnt": 289, + "ËĪaËIJɾ": 290, + "Ġki": 291, + "mp": 292, + "lt": 293, + "É£": 294, + "Ġpa": 295, + "ËĪÉĻËIJ": 296, + "ɪs": 297, + "ĠÉĴ": 298, + "Ġle": 299, + "ÉªÉľ": 300, + "ËĪÉĽt": 
301, + "Ġde": 302, + "Ġɹ": 303, + "ĠtËĪoËIJ": 304, + "ĠÊģ": 305, + "ÊĥÉĻn": 306, + "ĠÊĬnt": 307, + "ËĪÉĶɾ": 308, + "ËĪað": 309, + "Ġaɪ": 310, + "ĠÊIJ": 311, + "ĠmËĪa": 312, + "ra": 313, + "ĠkËĪɪ": 314, + "kt": 315, + "ËIJp": 316, + "ĠÊĪ": 317, + "ËĪaËIJÊĬ": 318, + "ĠkËĪÊĮɾ": 319, + "ĠËĪÊĮ": 320, + "ĠÉĴv": 321, + "Ġel": 322, + "ks": 323, + "Ġkw": 324, + "ÉĻt": 325, + "ndo": 326, + "ei": 327, + "ĠËĮaËIJp": 328, + "se": 329, + "ÉĻɹ": 330, + "ËĪuei": 331, + "ÉĻs": 332, + "ĠkËĮo": 333, + "ĠÊĤ": 334, + "ĠËĮÊĬ": 335, + "Ġc": 336, + "ĠÉĽn": 337, + "ËĪant": 338, + "θj": 339, + "ËĮoËIJ": 340, + "ĠËĪaËIJ": 341, + "Ġpɾ": 342, + "si": 343, + "ĠËĪe": 344, + "ĠjuËIJ": 345, + "ĠkËĮe": 346, + "ËĮɪ": 347, + "ÉĶn": 348, + "ĠsËĪÊĮ": 349, + "ĠËĪu": 350, + "ni": 351, + "Ġst": 352, + "ĠdiËIJ": 353, + "ĠkeËIJ": 354, + "ĠjËĪiou": 355, + "ËĪaiÉľ": 356, + "ĠdÊĴ": 357, + "ĠËĪÉĶ": 358, + "va": 359, + "ËIJɾ": 360, + "ËĪø": 361, + "ËĮÉĻÊĬ": 362, + "ĠpËĪu": 363, + "Ġsu": 364, + "Ġma": 365, + "ĠÉĻ": 366, + "dÊĴ": 367, + "Ġpʰ": 368, + "le": 369, + "in": 370, + "ĠtÉķhËĪi": 371, + "ĠwËĪo": 372, + "ro": 373, + "ËĮy": 374, + "ɾa": 375, + "ĠsËĪi": 376, + "ðÉĻ": 377, + "ĠseËIJ": 378, + "la": 379, + "ĠÊĴ": 380, + "mb": 381, + "ĠhËĪoËIJ": 382, + "Ġbʰ": 383, + "ĠÉĽÉ¾": 384, + "Ġðat": 385, + "sp": 386, + "ÉĶɾ": 387, + "en": 388, + "ĠsÉĻ": 389, + "ËĪÉĶÉľ": 390, + "ĠlËĮa": 391, + "ĠËĮÉĽ": 392, + "ĠËĪy": 393, + "É¡aËIJ": 394, + "ĠdÉĽÉ¾": 395, + "ËĪÉĽÊģ": 396, + "Éľkh": 397, + "ËĪiÉĻ": 398, + "ËĪan": 399, + "ĠmËĪo": 400, + "ËĪaβ": 401, + "Ġal": 402, + "ĠËĪeËIJ": 403, + "Ġθ": 404, + "ĠnËĪi": 405, + "pʰ": 406, + "lla": 407, + "Ġpl": 408, + "ËĪÅĵ": 409, + "jËĪÉiju": 410, + "Ġav": 411, + "ĠmËĪi": 412, + "ĠfËĪa": 413, + "ËĪÉľ": 414, + "me": 415, + "ËĮÉĻh": 416, + "ËĪuÉĻ": 417, + "it": 418, + "jËĪe": 419, + "Ġo": 420, + "ËĪÉľËIJ": 421, + "ĠtÉķËĪiou": 422, + "ÉĶËIJ": 423, + "ĠnÉĻ": 424, + "ËĪÉĻÉľn": 425, + "ĠmÉĻ": 426, + "ĠdeËIJ": 427, + "mo": 428, + "sa": 429, + "jËĪÉĶ": 430, + "ËĪal": 431, + "ĠtÉķËĪiÉĽ": 432, + 
"ĠÉ¡ÉĻ": 433, + "ða": 434, + "Ġɪz": 435, + "Ġsa": 436, + "ri": 437, + "ĠËĮil": 438, + "ËĮu": 439, + "ĠkaËIJ": 440, + "ĠÉĻËIJ": 441, + "ĠÉĸ": 442, + "Ġka": 443, + "ËĪÊĮhi": 444, + "ĠjeËIJ": 445, + "Ġtʰ": 446, + "ne": 447, + "kËIJ": 448, + "ĠtsËĪai": 449, + "ĠËĪeËIJk": 450, + "nk": 451, + "ti": 452, + "ËĪaÉľn": 453, + "ĠkËIJ": 454, + "É¡ÉĻn": 455, + "ËĪia": 456, + "ĠÉĶËIJɾ": 457, + "Êı": 458, + "ĠËĮÊĮ": 459, + "ĠzËĪaËIJ": 460, + "Ġlos": 461, + "ÉĽs": 462, + "ËĪÉĶn": 463, + "ÉĽnt": 464, + "ÉĽn": 465, + "ĠÉŁËĪoËIJ": 466, + "çt": 467, + "Ġdas": 468, + "ĠxËĮo": 469, + "ËĪuÉľ": 470, + "ËĪas": 471, + "ĠbËĪÊĮ": 472, + "ËĪiÉĽÉľn": 473, + "ÉIJ": 474, + "ĠtsuËIJ": 475, + "ĠpËĮÉĽ": 476, + "ĠnËĪÉĶ": 477, + "ÊĬt": 478, + "ma": 479, + "ĠnËĪo": 480, + "ĠlËĪɪ": 481, + "ËĪÉĽs": 482, + "ɪl": 483, + "ĠÉķËĪiÉĽ": 484, + "ĠËĪÊĬ": 485, + "ÉĴt": 486, + "to": 487, + "ĠËĪo": 488, + "ËĮon": 489, + "ĠkwËĪa": 490, + "Ġɪt": 491, + "ĠhoËIJ": 492, + "ËĪiËIJk": 493, + "ĠËĮaËIJpk": 494, + "ËĪaɪn": 495, + "æ": 496, + "ÉĻnt": 497, + "ta": 498, + "lo": 499, + "ĠnËĪÉij": 500, + "ĠlËĪa": 501, + "ËĪiÉľ": 502, + "ĠwËĪei": 503, + "ÉĽÊģ": 504, + "ĠtËĪa": 505, + "ĠɾËĮÉĻh": 506, + "ĠÉķËĪiÉij": 507, + "ËĮiËIJ": 508, + "ËĮÉĽl": 509, + "ĠtÉĻÉľ": 510, + "ĠkËĪuo": 511, + "ĠtËĪu": 512, + "jËĪÉĽ": 513, + "ĠËĮin": 514, + "ɾe": 515, + "ĠkoËIJ": 516, + "ĠkËĪa": 517, + "ɾi": 518, + "ĠtÉķËĪiÉij": 519, + "lÉĻ": 520, + "ĠkÉĻ": 521, + "ĠtËĪi": 522, + "ĠÅĭËĪyÉĻ": 523, + "Ġtsh": 524, + "er": 525, + "av": 526, + "ĠkÉĶn": 527, + "ËĪÉĻÉľÅĭ": 528, + "ðo": 529, + "ËĪaËIJn": 530, + "ĠbʰËĪi": 531, + "ĠkËIJjaËIJ": 532, + "ÉĻz": 533, + "ĠpÊģ": 534, + "ĠdËĪɪ": 535, + "ĠziËIJ": 536, + "É¡eËIJ": 537, + "ĠtËĪÉĻ": 538, + "ɪz": 539, + "ĠnËĮon": 540, + "taËIJ": 541, + "bl": 542, + "te": 543, + "nËĮeËIJ": 544, + "ËĪɪl": 545, + "so": 546, + "ko": 547, + "uÊģ": 548, + "ĠÉ£": 549, + "ĠpaÊģ": 550, + "ĠËĪÉĽ": 551, + "jËĪuËIJ": 552, + "ËĮÊĮ": 553, + "yn": 554, + "ËĪiËIJn": 555, + "ĠlËĪaɪ": 556, + "ËĪɪÅĭ": 557, + "ĠtÉķhËĪy": 558, + "ĠnËĪÊĮhi": 559, + 
"ĠdËĮe": 560, + "ĠjËĪÉiju": 561, + "ĠtËĪÉiju": 562, + "ĠhËĪo": 563, + "ɪd": 564, + "ĠthËĪÉij": 565, + "mËĪe": 566, + "ĠËĪÉĻ": 567, + "ja": 568, + "Ġph": 569, + "ÉĽt": 570, + "ĠkËĪÊĮ": 571, + "tÉĻn": 572, + "mËĪÉij": 573, + "wËĪe": 574, + "ĠËĮaɪn": 575, + "Ġðɪs": 576, + "É¡ÉĻ": 577, + "ĠnËĪaËIJ": 578, + "ĠbËĪaËIJ": 579, + "Ġaθ": 580, + "ĠmËĮa": 581, + "ËĪÊĮha": 582, + "ĠdËĮa": 583, + "ËĪÊı": 584, + "ĠɲËĮy": 585, + "ĠpËĪa": 586, + "ËĪaðo": 587, + "di": 588, + "bÉľ": 589, + "ɳ": 590, + "ĠwiËIJ": 591, + "ĠnËĪɪ": 592, + "ĠÉ¡ËĪÉĶÉľ": 593, + "tËIJo": 594, + "ËĮÉĻm": 595, + "ËĪaËIJr": 596, + "ĠmÉĽ": 597, + "ËĪeËIJÉ¡aËIJ": 598, + "ĠsËĮi": 599, + "ĠlËĮaËIJ": 600, + "nËĮaËIJ": 601, + "Ġsp": 602, + "tÊģ": 603, + "ĠÊİ": 604, + "ËĮÉijËIJ": 605, + "Ġkl": 606, + "kʰ": 607, + "il": 608, + "ĠÊĥt": 609, + "ĠËĮÊĬn": 610, + "al": 611, + "ĠsËĪÉĽ": 612, + "ĠmËĪaËIJ": 613, + "ĠÅĵ": 614, + "ĠÉ¡ËĪÊĮ": 615, + "ĠpËĮÉĽr": 616, + "ɾËĪa": 617, + "ËIJÊĪ": 618, + "ËĪaβa": 619, + "ĠwËĪÉĴ": 620, + "ĠxËĪuei": 621, + "ĠkhËĪo": 622, + "Ġlas": 623, + "ĠÉĹËĪo": 624, + "ĠfÉĽÉ¾": 625, + "ĠjËĪiÉĽ": 626, + "ĠtËĪe": 627, + "ĠkËĮÉĶ": 628, + "ĠdeËIJn": 629, + "Ġmo": 630, + "ĠpËĪi": 631, + "ĠtËĪÉij": 632, + "ËĪÉĽst": 633, + "wËĪÉij": 634, + "ËĪaɪt": 635, + "ÉĻÊĬ": 636, + "ĠËĪi": 637, + "ɪj": 638, + "aɪ": 639, + "ËĪaËIJÉľ": 640, + "ĠËĪɪs": 641, + "ĠpÉĶɾ": 642, + "Ã¦Éľn": 643, + "ka": 644, + "ÅĭÉ¡": 645, + "bÉĻn": 646, + "ÊĬf": 647, + "Ġpɹ": 648, + "ĠlËĮe": 649, + "ËĪiËIJd": 650, + "ËĪaËIJre": 651, + "ĠmËĪÊĮ": 652, + "ÉĻr": 653, + "ĠdÉij": 654, + "ËĪaËIJto": 655, + "ĠpËĪeËIJ": 656, + "ĠdËĪoËIJ": 657, + "ĠsËĮÊĬ": 658, + "ĠhËĪi": 659, + "ĠsËĪa": 660, + "ËĪeËIJn": 661, + "dÉĻ": 662, + "Ġpj": 663, + "ËĪÅĵÊģ": 664, + "lɪç": 665, + "ÉĴn": 666, + "ĠËĪÉĻr": 667, + "tËĪe": 668, + "Ġil": 669, + "ËĪaËIJl": 670, + "ĠsËĮÉĻÊĬ": 671, + "sÊĪ": 672, + "ĠdËĪuËIJ": 673, + "hËĪÉij": 674, + "ĠxËĪou": 675, + "ĠlËĪaiÉľ": 676, + "wËĪo": 677, + "ËĪÉĽnte": 678, + "Ġsy": 679, + "Ġzɪç": 680, + "ĠÉ¡ËĪu": 681, + "ĠÉķËĪy": 682, + "ËĪÉĶËIJl": 
683, + "ÉĶl": 684, + "ĠtËĪo": 685, + "ĠÊĭoËIJ": 686, + "ĠiËIJ": 687, + "wËĪaða": 688, + "ËĪando": 689, + "Ġaθɼnt": 690, + "ĠaθɼntwËĪaða": 691, + "ĠtËĪiÉĽ": 692, + "ËĪeiÉľ": 693, + "ĠpËĮa": 694, + "ĠnËĪaɪ": 695, + "wa": 696, + "Ġfr": 697, + "ĠÊIJËĪÉĻÉľn": 698, + "ËĪua": 699, + "mi": 700, + "ĠmËĪÉĽ": 701, + "ËĪeËIJkʰ": 702, + "cʰ": 703, + "ĠwËĪÉij": 704, + "sta": 705, + "Ġtu": 706, + "Ġsk": 707, + "ËĪÉĶl": 708, + "ËĪeËIJÊĪ": 709, + "ĠlËĪaËIJɪ": 710, + "ĠlËĪaËIJ": 711, + "ËĪÉĽËIJs": 712, + "ËĪÉĽÉ¾a": 713, + "ËĪÉĻÉľt": 714, + "Ġyn": 715, + "dÉĻn": 716, + "Ġdi": 717, + "ËĪiËIJs": 718, + "Ġðel": 719, + "ËĪÊĮr": 720, + "ĠhËĪaËIJ": 721, + "ĠbÉĻ": 722, + "ĠjËĪuËIJ": 723, + "lle": 724, + "sto": 725, + "ËĪɪt": 726, + "ËĪoËIJɾ": 727, + "bʰ": 728, + "mÉĻn": 729, + "ËĮuÉĻ": 730, + "ËĮÉĻɾ": 731, + "ËĪÊĮn": 732, + "ĠlËĪaɪk": 733, + "ĠbËĪa": 734, + "ɪð": 735, + "Ġlo": 736, + "zi": 737, + "ËĪÊĮst": 738, + "mËĪi": 739, + "ÉĶÊģ": 740, + "ĠnËĪɪçt": 741, + "Ġtɾ": 742, + "ĠdËĪeËIJkʰ": 743, + "ĠsËĮe": 744, + "ĠnËĪÉĻÊĬ": 745, + "Ġu": 746, + "Ġsi": 747, + "Ġɪç": 748, + "Ġpr": 749, + "ĠtÉķËĪy": 750, + "ĠmËĪu": 751, + "za": 752, + "ĠtÊģ": 753, + "Ġwɪð": 754, + "tËĪÉĽ": 755, + "ĠpËĪÊĮɾ": 756, + "ĠkËĪÉĶ": 757, + "ËĪoËIJr": 758, + "ĠhËĮa": 759, + "ĠkËĪonÉ¡": 760, + "ĠpuÊģ": 761, + "Ġdy": 762, + "ËĪɪn": 763, + "nte": 764, + "ĠkËĮa": 765, + "ËĪÉĻɪ": 766, + "Ġmi": 767, + "ĠÉ¡ËĮuÉĻ": 768, + "Ġʲ": 769, + "ĠfËĪÉij": 770, + "ĠvÉijËIJ": 771, + "ĠËĮaÊĬ": 772, + "ËĮuËIJ": 773, + "ĠËĪun": 774, + "ĠjËĪÊĮha": 775, + "juËIJ": 776, + "Ġmɪt": 777, + "ĠlËĪÉĽ": 778, + "ËĪeËIJÊĥ": 779, + "ĠfÉĶËIJ": 780, + "mÉĻ": 781, + "ɾt": 782, + "ĠkËĮon": 783, + "ĠlËĪÉĶ": 784, + "ĠxËĪÉiju": 785, + "pl": 786, + "ĠdËĪi": 787, + "ĠlËĪoËIJ": 788, + "sÉĻ": 789, + "ËĪaËIJva": 790, + "ĠlËĪu": 791, + "ĠÉ¡ËĮÉĻÊĬ": 792, + "Ġhav": 793, + "ĠËĮaËIJpkËĮoËIJ": 794, + "ɾËĪi": 795, + "ĠfËĪÉĻ": 796, + "ĠhËĮÉĻm": 797, + "ËĪonÉ¡Éľ": 798, + "jo": 799, + "ĠsÉĶ": 800, + "ËĪaËIJd": 801, + "wËĪiÉĻ": 802, + "ËĪand": 803, + "ËĮaɪn": 804, + "tɾ": 805, + 
"ĠËĮɪ": 806, + "ĠËĪuna": 807, + "ĠxwËĪÉij": 808, + "ĠjÉĶËIJ": 809, + "ÊģËĪi": 810, + "ĠkËĪuoÉľ": 811, + "Ġaβ": 812, + "ĠÉ¡ËĪaËIJ": 813, + "ano": 814, + "tÉĻl": 815, + "ĠrËĮe": 816, + "ËĮÊĮt": 817, + "ĠjËĪiÉij": 818, + "ĠɾËĮÉĻhaËIJ": 819, + "ĠmËĪe": 820, + "ĠËĪyÃ¦Éľn": 821, + "ĠfËĪu": 822, + "Ġbl": 823, + "nËĪi": 824, + "sÉĻn": 825, + "Ġaɪn": 826, + "ËĪiÊĬ": 827, + "Ġðeɪ": 828, + "Ġɪts": 829, + "Ġ(": 830, + "ËĪyËIJ": 831, + "ÉĻd": 832, + "ĠËĮo": 833, + "ĠÉĽs": 834, + "ĠviËIJ": 835, + "ËIJÉ¡eËIJ": 836, + "kËĪe": 837, + "ĠËĪal": 838, + "ÉĽl": 839, + "ĠÊĮ": 840, + "ËIJo": 841, + "ĠkËĪo": 842, + "ĠÊĪËĪuËIJ": 843, + "ĠsËĪɪ": 844, + "ËĪeËIJɾ": 845, + "Éľm": 846, + "ËĮÉĻn": 847, + "ËĪaËIJi": 848, + "ËĪoËIJl": 849, + "ɪËĮeËIJ": 850, + "ĠʲËĪy": 851, + "ĠkËĪÉĶËIJ": 852, + "sËĪi": 853, + "ĠlËĪe": 854, + "ËĮÉĴt": 855, + "ËĪiËIJp": 856, + "aÊģ": 857, + "ĠθËĪɪÅĭ": 858, + "ËĪÉĻËIJɪ": 859, + "ËĪÊĮl": 860, + "ĠhËĪoËIJtaËIJ": 861, + "ËĪoɪ": 862, + "nto": 863, + "zh": 864, + "ĠdeËIJm": 865, + "ĠkÉĶm": 866, + "ʰËĪiËIJk": 867, + "ĠdÊĴËĪÊĮst": 868, + "pɾ": 869, + "Ġly": 870, + "hËĪu": 871, + "ËĪÉĶø": 872, + "ËĪaËIJs": 873, + "ĠËĪan": 874, + "ĠËĪÉĴ": 875, + "Ġkan": 876, + "ĠtsËĪuo": 877, + "ËĪeËIJva": 878, + "Ġɡɾ": 879, + "Ġpo": 880, + "ĠtÊĥËĪÉĶ": 881, + "Êİa": 882, + "ĠmËĮi": 883, + "Êĥt": 884, + "tËĪi": 885, + "ĠhËĪÊĮ": 886, + "tÊĥe": 887, + "ĠfÉĶn": 888, + "ve": 889, + "ĠnËĮe": 890, + "ËĪÉĶÊģ": 891, + "iz": 892, + "ĠsËĪuo": 893, + "ËĪÉĽËIJr": 894, + "wËĪaÊģ": 895, + "ËĪaða": 896, + "Åĭk": 897, + "po": 898, + "ĠkËĪi": 899, + "ËĪad": 900, + "ĠvËĪi": 901, + "tÉķ": 902, + "ĠkËĪÉĻ": 903, + "ĠwËĪu": 904, + "ÉĴz": 905, + "ĠvÉijËIJɾ": 906, + "ÊģËĪÉĽ": 907, + "ĠkËĪaËIJ": 908, + "ke": 909, + "nÉĻ": 910, + "ËĪÊĮb": 911, + "ËĪuËIJɾ": 912, + "ËĮÉĻËIJ": 913, + "ĠÊĪʰËĪiËIJk": 914, + "ĠkËĪu": 915, + "ĠbËĮÊĮt": 916, + "Ġat": 917, + "Ġfɹ": 918, + "ËĪax": 919, + "ĠzoËIJ": 920, + "ĠtËĪaËIJ": 921, + "ĠðËĮe": 922, + "neËIJ": 923, + "ĠÉijËIJ": 924, + "ĠaÊĬf": 925, + "am": 926, + "ÊĬÅĭ": 927, + "ĠÉĶËIJ": 928, 
+ "ĠÉķËĪiÉľÅĭ": 929, + "ĠËĪÉĶËIJl": 930, + "ɪm": 931, + "jËĪo": 932, + "ËĪiËIJÉŁ": 933, + "ĠkwËĮÉĽ": 934, + "ĠmËĪas": 935, + "ÉĻh": 936, + "ĠËĪaÊĬ": 937, + "ËĪÉĶɪ": 938, + "É¡ÉĻɾ": 939, + "rÉĻn": 940, + "ËĪɪk": 941, + "sse": 942, + "ĠpËĪÉij": 943, + "ĠÉĹËĮe": 944, + "ĠÉĹËĪi": 945, + "Ġaz": 946, + "ĠÉ¡ËĪÊĮjaËIJ": 947, + "ze": 948, + "ĠÉĹËĮaËIJ": 949, + "ĠfËĪi": 950, + "ĠËĮÉĴn": 951, + "ĠxËĪo": 952, + "ĠËĮÊĬna": 953, + "ĠtʰaËIJ": 954, + "ĠsÉij": 955, + "ËĪeɪÊĥÉĻn": 956, + "ĠtÉķËĪiÉľ": 957, + "ĠÉŁaËIJ": 958, + "pËIJ": 959, + "Ġply": 960, + "θËĪi": 961, + "ËIJÉĸ": 962, + "ĠtËĪuei": 963, + "ĠlËĪÉĻ": 964, + "ĠdÉijËIJ": 965, + "ft": 966, + "ËĪam": 967, + "ĠsËĪÊĮkt": 968, + "ĠtËĪou": 969, + "ĠpËĪiÉĽ": 970, + "ĠËĪai": 971, + "ĠwËĪÉĴn": 972, + "ĠzËĮaɪn": 973, + "Ġest": 974, + "ĠmÉĶ": 975, + "ĠtÉķjËĪÉiju": 976, + "Éľp": 977, + "ËĪÊĮz": 978, + "bi": 979, + "ËĪÉĽËIJseËIJ": 980, + "ĠlËĪy": 981, + "ĠmËĮe": 982, + "ĠdËĮÉĽl": 983, + "ËĪiËIJl": 984, + "ĠkËĮomo": 985, + "ĠhËĪaÉľn": 986, + "ËĪoËIJne": 987, + "ĠkËĪÊĮɾt": 988, + "ĠsyÊģ": 989, + "ËĮÉĶɾ": 990, + "Ġɪf": 991, + "uv": 992, + "zÉĻn": 993, + "ol": 994, + "Ïĩ": 995, + "im": 996, + "ĠmËĪiÉĽ": 997, + "Ġðɪ": 998, + "ĠvËĪÉĽ": 999, + "ÊĬd": 1000, + "Ġtr": 1001, + "ËĪeËIJs": 1002, + "ðe": 1003, + "de": 1004, + "ʰÏĩ": 1005, + "ÉŁÊ°": 1006, + "ËĮÉĻËIJÉªÉľ": 1007, + "bËIJ": 1008, + "ËĪÊĬk": 1009, + "ĠnËĪÉĶÉªÉľ": 1010, + "ĠËĮiËIJ": 1011, + "ËĪÉijËIJt": 1012, + "ËĪiËIJɾ": 1013, + "Ġtɹ": 1014, + "ɾÉĶ": 1015, + "ĠwÉĴz": 1016, + "Ġvu": 1017, + "bÉĻl": 1018, + "bÉĻ": 1019, + "ɹi": 1020, + "nts": 1021, + "ĠsËĪaËIJ": 1022, + "dʰ": 1023, + "ĠtÊĬ": 1024, + "ĠÊİËĮi": 1025, + "βa": 1026, + "hËĪÉĻÉľÅĭ": 1027, + "ĠsËĪiËIJ": 1028, + "ĠpËĮaɾa": 1029, + "ËĪÉĽÉ¾ÉĶ": 1030, + "ËĪɪs": 1031, + "É£o": 1032, + "ĠËĮal": 1033, + "or": 1034, + "ĠbËĪÊĮh": 1035, + "ĠkËĪoËIJ": 1036, + "ĠtËĪÉĽ": 1037, + "ĠpËĪo": 1038, + "ĠÊĴÉĻ": 1039, + "pÊģ": 1040, + "ĠËĪaɪ": 1041, + "hËĪÉijÉľÅĭ": 1042, + "ÉĻli": 1043, + "ËĪeɪt": 1044, + "ĠjËĪiouÉľ": 1045, + "ĠdËĪÉĻ": 1046, + 
"ĠmËĪÉĶËIJ": 1047, + "lËĪi": 1048, + "ËĮyÉĻ": 1049, + "ĠlËĪoËIJÉ¡": 1050, + "ĠnËĪÊĮ": 1051, + "ĠhËĪÊĬ": 1052, + "ĠnËĪÉĻÉľÅĭ": 1053, + "ĠÊģÉĻ": 1054, + "zËĪi": 1055, + "ĠtËĪuËIJ": 1056, + "ĠkËĮome": 1057, + "ĠlËĪeËIJ": 1058, + "ËĪaËIJtaËIJ": 1059, + "Ġan": 1060, + "ĠËĪyu": 1061, + "ĠËĮÊĮÉ¡ÉĻɾ": 1062, + "ĠËĪɪn": 1063, + "ĠhËĪoÉĻ": 1064, + "vÉĻ": 1065, + "ËĪøËIJ": 1066, + "θja": 1067, + "ËĪuÉĻÉľn": 1068, + "ĠkÉĻɾ": 1069, + "ËĪat": 1070, + "jËĪø": 1071, + "ËĪÉĽtÊģ": 1072, + "ĠpËĪÉiju": 1073, + "stÉĻ": 1074, + "ĠwÉĴt": 1075, + "ËĪeËIJl": 1076, + "ÊĪi": 1077, + "ĠxËĪaiÉľ": 1078, + "ËĪyÊģ": 1079, + "ĠhËĪoËIJÉ¡aËIJ": 1080, + "ĠtsËĪi": 1081, + "ĠËĪÊĮp": 1082, + "ĠnËĮÉĴt": 1083, + "ĠlËĪɪeËIJ": 1084, + "ĠhËĪa": 1085, + "Ġfl": 1086, + "ĠnËĪeËIJ": 1087, + "ËĮaËIJɪ": 1088, + "ĠtËĪuo": 1089, + "tÊĥËIJ": 1090, + "sËĪe": 1091, + "bʰi": 1092, + "ĠbËĪÊĮhÊĬt": 1093, + "ËĪÉĽnd": 1094, + "ĠsËĪÉĶ": 1095, + "ÉĻns": 1096, + "ËĮÉĻl": 1097, + "ÉĽÉľ": 1098, + "ĠÉ¡l": 1099, + "ËĪɪɾ": 1100, + "ËĪaËIJta": 1101, + "ÉľËIJ": 1102, + "ËĪÉĽnto": 1103, + "skËĮoËIJ": 1104, + "ËĪÉĽk": 1105, + "tsi": 1106, + "ĠtËĪonÉ¡": 1107, + "ĠbiËIJ": 1108, + "ĠhËĪaËIJɪ": 1109, + "ĠbËĪi": 1110, + "jj": 1111, + "Êİi": 1112, + "Ġkʰ": 1113, + "ĠsËĪo": 1114, + "llo": 1115, + "Ġbaɪ": 1116, + "ĠÉĽnt": 1117, + "ĠËĪiËIJ": 1118, + "ĠÉ¡ËĪo": 1119, + "ɾeËIJ": 1120, + "ĠkÊĭ": 1121, + "ĠmËĪeiÉľ": 1122, + "ÊĬËĪÉĶËIJ": 1123, + "ĠtËĪaɪ": 1124, + "Ġsus": 1125, + "Ġri": 1126, + "ĠvËĮÉĽ": 1127, + "ËĪiËIJno": 1128, + "vano": 1129, + "ĠdËĮiËIJ": 1130, + "ĠÊIJËĪaÉľn": 1131, + "ÊĤ": 1132, + "ĠÉIJb": 1133, + "ËĪaËIJh": 1134, + "ɪÊĥ": 1135, + "ĠdËĮella": 1136, + "tËIJi": 1137, + "ĠËĪÊĬn": 1138, + "ĠhiËIJ": 1139, + "ĠbËĪaËIJt": 1140, + "ĠthËĪi": 1141, + "Ġam": 1142, + "ĠËĪoËIJ": 1143, + "Ġhu": 1144, + "ĠkËĪÊĮh": 1145, + "ĠzËĪÉijËIJ": 1146, + "ĠÉ¡ËĮÉĶ": 1147, + "ĠËĪÉĻÊĬ": 1148, + "yËĪi": 1149, + "ĠlËĪÊĮ": 1150, + "ĠdËĪeËIJ": 1151, + "ĠsËĪÉĶËIJ": 1152, + "skËĮeËIJ": 1153, + "ɾo": 1154, + "ÊģËĪÉij": 1155, + "tËĪa": 1156, + "ĠkËĪÊĬ": 1157, + 
"ËĪante": 1158, + "ĠdÉĶ": 1159, + "ĠsËĪeɪ": 1160, + "ĠsÉĽt": 1161, + "ɹɪ": 1162, + "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, + "zo": 1164, + "ĠjËĪaËIJ": 1165, + "ĠÉĴvðÉĻ": 1166, + "ĠÊĿ": 1167, + "ĠÉĽl": 1168, + "ĠsËĪoËIJ": 1169, + "ĠthËĪiÉľ": 1170, + "ĠËĪÉĽl": 1171, + "ĠlyËĮi": 1172, + "ndÊĴ": 1173, + "ĠÉķjËĪÉiju": 1174, + "θa": 1175, + "ĠɾËĮÉĻheËIJ": 1176, + "Ġmaɪ": 1177, + "jÉĻ": 1178, + "ĠËĪÊĮb": 1179, + "asjËĪÉĶ": 1180, + "dÊģ": 1181, + "ĠkhËĪa": 1182, + "ĠËĪes": 1183, + "vi": 1184, + "fi": 1185, + "ËĮÉĻb": 1186, + "Ġre": 1187, + "ĠavËĮÉĽ": 1188, + "ĠtËĮi": 1189, + "Ġkɾ": 1190, + "Ġbɪk": 1191, + "ste": 1192, + "ËĪeËIJÊĥc": 1193, + "pt": 1194, + "zÉĻ": 1195, + "ĠwËĪaËIJ": 1196, + "kl": 1197, + "ĠsËĪÊĮm": 1198, + "ɪÊĪ": 1199, + "dz": 1200, + "vo": 1201, + "ËĮaÊĬt": 1202, + "nde": 1203, + "ĠdÉĽs": 1204, + "ĠÉŁËĪaËIJ": 1205, + "ĠrËĮi": 1206, + "sËĮeËIJ": 1207, + "É¡i": 1208, + "Ġals": 1209, + "ËĪiðo": 1210, + "ĠnËĪiÉľn": 1211, + "ÊĬl": 1212, + "tsËIJ": 1213, + "ËĪanto": 1214, + "ĠÉĹËĪÉĻÊĬ": 1215, + "kËIJi": 1216, + "ĠsËĪÊĮb": 1217, + "ĠnËĪa": 1218, + "ĠlËĮo": 1219, + "ĠphËĪi": 1220, + "mËĮe": 1221, + "Ġfa": 1222, + "kÉĻ": 1223, + "ĠzËĪu": 1224, + "ns": 1225, + "ĠÊģe": 1226, + "ĠbËĪo": 1227, + "ËĪaËIJti": 1228, + "Ġman": 1229, + "ĠlËĪiÉij": 1230, + "ĠÉĹËĮyÉĻ": 1231, + "ĠfËĪÉĶËIJ": 1232, + "ĠkÊĭËĪeËIJÊĥc": 1233, + "ĠxËĪÉij": 1234, + "ĠtÉķËĪu": 1235, + "jÉĻɾ": 1236, + "Ġɪst": 1237, + "wËĪi": 1238, + "ĠËĮaɪnÉĻ": 1239, + "ɪɡ": 1240, + "ĠsÊĪ": 1241, + "ËĪiÉĻl": 1242, + "ĠnËĪiÉĽÉľn": 1243, + "ĠËĮÉĽËIJ": 1244, + "ËĪaɪnd": 1245, + "ĠzËĪi": 1246, + "vÉĻn": 1247, + "mz": 1248, + "ðos": 1249, + "dÊĴËIJ": 1250, + "jËĪa": 1251, + "ɾËĪÉĶ": 1252, + "lËĪe": 1253, + "ʲ": 1254, + "ĠvËĪÉĶ": 1255, + "ĠlËĪiÉĽ": 1256, + "θe": 1257, + "mËĪente": 1258, + "ĠɪnðÉĻ": 1259, + "Ġaɪm": 1260, + "nÉĻn": 1261, + "ĠhÉĻm": 1262, + "ɾaËIJ": 1263, + "ĠsËĪuoÉľ": 1264, + "ĠɲËĪi": 1265, + "ĠɹËĪiÉĻl": 1266, + "lËĪa": 1267, + "ĠbËĪÉĶ": 1268, + "ĠkËĪai": 1269, + "ÊģËĪa": 1270, + "ĠwËĪÉľËIJ": 1271, + "ĠaËIJ": 1272, + "Ġpas": 
1273, + "ËĪÊĮs": 1274, + "wËĪÉĽÉ¾": 1275, + "ĠÉĹËĪe": 1276, + "ĠhËĮatÉĻ": 1277, + "aɪn": 1278, + "ĠËĪÉĶpʰ": 1279, + "ÊģËĪe": 1280, + "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, + "ĠËĪÊĬs": 1282, + "ĠtÉķhËĪiÉľ": 1283, + "ntÊĥ": 1284, + "ĠxËĪuo": 1285, + "ËĪuÊģ": 1286, + "Ġɪm": 1287, + "ɳÉĸ": 1288, + "ËĪyÉĻÉľkh": 1289, + "ĠËĪyÉĽ": 1290, + "ĠmËĮaËIJ": 1291, + "ÅĵÊģ": 1292, + "ĠËĪalt": 1293, + "ĠkÉĻm": 1294, + "Êİo": 1295, + "ĠÉIJn": 1296, + "Ġfy": 1297, + "ĠËĮÉĽra": 1298, + "ĠÉ¡ËĪÊĬ": 1299, + "ĠpËĪÊĮ": 1300, + "ls": 1301, + "ĠlËĪiËIJ": 1302, + "ĠÊĤËĪy": 1303, + "ĠbɪkËĪÊĮz": 1304, + "ĠÉ¡ÉĽt": 1305, + "Ġbɾ": 1306, + "tʰ": 1307, + "tÉĻlËĮÉĻb": 1308, + "xo": 1309, + "skËĮaËIJ": 1310, + "ɲʲ": 1311, + "ËĪeËIJkÊĪ": 1312, + "rÉĻ": 1313, + "tÊĥo": 1314, + "ĠpÊģÉĶ": 1315, + "ĠɹËĪaɪt": 1316, + "ĠpËĪei": 1317, + "ËĮɪç": 1318, + "jËĪÉĽÉ¾": 1319, + "tËIJa": 1320, + "ĠÉIJbËĮaÊĬt": 1321, + "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, + "ĠvËĪe": 1323, + "ÊĬÉľ": 1324, + "ĠakËĪe": 1325, + "ĠpËĪai": 1326, + "vËĪÉĽ": 1327, + "Ġθɹ": 1328, + "ɪf": 1329, + "ĠavËĪÉĽ": 1330, + "ĠkËĪe": 1331, + "dËĪi": 1332, + "ËĪeËIJÉĸ": 1333, + "ĠbÉĻt": 1334, + "ÊĪʰ": 1335, + "teËIJ": 1336, + "θjËĪÉĶn": 1337, + "dÉľ": 1338, + "ĠjËĪiÉľ": 1339, + "Ġve": 1340, + "É£ËĪu": 1341, + "ËĪÊĮhÉĻl": 1342, + "ĠpÉĶ": 1343, + "ĠÉ¡r": 1344, + "Ġða": 1345, + "ĠvËĪiËIJ": 1346, + "ĠËĮÉijËIJ": 1347, + "ËĪÉĻÊĬnt": 1348, + "ĠbËĪaËIJɾ": 1349, + "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, + "ld": 1351, + "ĠtÉķËĮÉĶ": 1352, + "pa": 1353, + "ðËĪad": 1354, + "ËĪiɾ": 1355, + "ĠxËĪu": 1356, + "ĠlËĪiÉľÅĭ": 1357, + "ËĪeɪs": 1358, + "ĠÉĹËĮeÉľn": 1359, + "ĠthËĪiÉĽ": 1360, + "tËIJe": 1361, + "ĠavËĮÉĽk": 1362, + "ĠËĮÉĶ": 1363, + "ĠkËĪÉiju": 1364, + "ɪv": 1365, + "iËIJz": 1366, + "ËĪos": 1367, + "Ġɡɹ": 1368, + "and": 1369, + "ĠlËĪiou": 1370, + "ĠËĪoÉľ": 1371, + "É¡l": 1372, + "ĠpËĪÉĶËIJ": 1373, + "ĠmËĮeËIJ": 1374, + "ĠkËĪÉĴ": 1375, + "nos": 1376, + "çÉĻn": 1377, + "fÉĻn": 1378, + "ĠsËĪÊĮktËĮeËIJ": 1379, + "ĠËĪaɪn": 1380, + "ËĪoËIJre": 1381, + "jËĪÉĽn": 1382, + "ĠðËĪÉĽn": 1383, + "ĠtÉķhËĪiÉĽÉľn": 
1384, + "ĠhËĪaɪ": 1385, + "ɾËĪÉĽ": 1386, + "ĠsËĪu": 1387, + "ĠkËĪɪjaËIJ": 1388, + "ĠpjËĮÊĬ": 1389, + "ĠhÉĻmËĮaËIJ": 1390, + "ĠËĮÊĮp": 1391, + "ĠpËĪÊĮhÉĻl": 1392, + "ĠxËĪÉĻ": 1393, + "dËĪe": 1394, + "ĠmÉij": 1395, + "ĠÊĬm": 1396, + "ndÉĻ": 1397, + "ĠdËĪÉĻÊĬnt": 1398, + "ËĪeËIJÊĥÉĻn": 1399, + "Ġðats": 1400, + "is": 1401, + "ĠcËĪaËIJh": 1402, + "pe": 1403, + "ĠsËĮo": 1404, + "ĠðËĪe": 1405, + "ĠsËĪaËIJt": 1406, + "ËĪaÊģ": 1407, + "ĠsËĪe": 1408, + "ÉĻk": 1409, + "ɪÊĭ": 1410, + "ĠkËĪoËIJi": 1411, + "kÉĶ": 1412, + "ĠvËĪaËIJÊĬ": 1413, + "ĠfËĪei": 1414, + "ĠlËĪeËIJk": 1415, + "ĠhËĪiÉĻ": 1416, + "ĠaÊĬ": 1417, + "ËĪÉĽndo": 1418, + "ËĪes": 1419, + "ĠzËĪÉĶ": 1420, + "ĠËĪÉĽÉ¾a": 1421, + "nËĪiÉľn": 1422, + "ĠkËĪÊĮm": 1423, + "ĠlËĪÉĴ": 1424, + "ɪst": 1425, + "ĠpÉij": 1426, + "ĠfËĪÉĶ": 1427, + "ĠthËĪonÉ¡": 1428, + "nke": 1429, + "ËĮɪk": 1430, + "ĠɲËĪÉĻ": 1431, + "ËĮÊĮm": 1432, + "ËĪiËIJt": 1433, + "ĠwËĪÉĴnt": 1434, + "ËĪaβan": 1435, + "ĠbËĪÊĮr": 1436, + "ÉĽnd": 1437, + "ĠËĮÉijËIJbÉľ": 1438, + "ĠvËĪaɪ": 1439, + "ĠtÊĥËĮi": 1440, + "ĠθËĪɪÅĭk": 1441, + "sti": 1442, + "Ġkɹ": 1443, + "ĠËĪaÊĬt": 1444, + "stÉĻn": 1445, + "ĠÊĭËĪÊĮn": 1446, + "ĠÉ¡ËĮaËIJ": 1447, + "ËĪaËIJÉľÉ²": 1448, + "Êģi": 1449, + "ĠnËĪÉĶx": 1450, + "ĠɹËĪiÉĻlɪ": 1451, + "ĠvËĮi": 1452, + "ĠðeÉĻ": 1453, + "ËĮɪtÊĥ": 1454, + "ĠvËĪyÉĻ": 1455, + "ĠËĮaËIJpkËĮaËIJ": 1456, + "ĠfËĮaËIJɪ": 1457, + "ĠpËĪÉĶ": 1458, + "ĠnËĪÊĮmb": 1459, + "θes": 1460, + "jËĪÉĽÊģ": 1461, + "ĠkËĪÊĬcʰ": 1462, + "mËĪÉĽ": 1463, + "ĠvËĪu": 1464, + "ĠlÅĵÊģ": 1465, + "ĠiËIJm": 1466, + "ÊĪÉĻɾ": 1467, + "tÊĥi": 1468, + "ËIJs": 1469, + "ĠtËĪy": 1470, + "ĠmËĪiÉľÅĭ": 1471, + "ɾËĪe": 1472, + "mËĮa": 1473, + "ĠmËĮiËIJ": 1474, + "ĠÉĽks": 1475, + "ɪp": 1476, + "ĠkËĪÊĮɾnËĮaËIJ": 1477, + "ĠËĮaÊĬx": 1478, + "rËĪiËIJ": 1479, + "ĠcËĪÊĮl": 1480, + "mos": 1481, + "ĠkËĪÊĮɾtËĮeËIJ": 1482, + "iËIJɾ": 1483, + "kÉĻn": 1484, + "ĠdËĪu": 1485, + "naËIJ": 1486, + "ĠpwËĪe": 1487, + "ËĮÉĶɪ": 1488, + "ĠtÉķhËĪiÉĽ": 1489, + "ĠβËĪi": 1490, + "ËĪiÉĽÉľt": 1491, + "Ġte": 1492, + "ËĪaðos": 1493, 
+ "mËĪa": 1494, + "ĠvËĪo": 1495, + "ĠmËĪɪ": 1496, + "ĠbËĮi": 1497, + "ad": 1498, + "do": 1499, + "ĠnËĪaÊĬ": 1500, + "ĠʲËĪyÉľ": 1501, + "wËĪÉĽ": 1502, + "ËĪis": 1503, + "el": 1504, + "Ġpar": 1505, + "ĠtËĪai": 1506, + "ĠdËĪɪjaËIJ": 1507, + "hËĪi": 1508, + "ĠɾËĪÊĮ": 1509, + "ĠdËĪe": 1510, + "ËĪaɪd": 1511, + "Ġper": 1512, + "ĠsËĮÉĶ": 1513, + "we": 1514, + "ÊĬm": 1515, + "Ġin": 1516, + "ĠjËĪuËIJz": 1517, + "ËĪiËIJpÉĻl": 1518, + "ĠÊĭËĪaËIJl": 1519, + "ĠetËĪÉĽ": 1520, + "ËĮÉĽm": 1521, + "ĠnËĪu": 1522, + "ËĪÉĽkt": 1523, + "ĠiËIJɾ": 1524, + "Ġbɹ": 1525, + "ĠtshËĪi": 1526, + "ĠÉĹËĪÉĶÉľ": 1527, + "ĠkwËĮa": 1528, + "ĠfËĪuÉľ": 1529, + "wËĮa": 1530, + "ĠdËĪiËIJ": 1531, + "ĠÉ¡ËĪyÉĻ": 1532, + "ËĮÉĽËIJ": 1533, + "rËĪa": 1534, + "Ġne": 1535, + "ĠzËĪyÉĻ": 1536, + "ĠbËĪaɪ": 1537, + "ĠÉŁËĪÊĮb": 1538, + "ËĪuËIJto": 1539, + "ÊĬnt": 1540, + "Ġcʰ": 1541, + "ËĪÉĽnti": 1542, + "ËĪoÉĻ": 1543, + "ĠsËĮÊĮm": 1544, + "ĠlÉij": 1545, + "ËĮeva": 1546, + "É¾ÉĽ": 1547, + "ntÉľ": 1548, + "ĠmËĪÉĽn": 1549, + "ËĪÉijËIJk": 1550, + "Ġkil": 1551, + "ËĪones": 1552, + "ff": 1553, + "ĠmËĪÉĽËIJ": 1554, + "ĠvËĪÉĻɪ": 1555, + "ĠËĪÉĶËIJ": 1556, + "ĠËĮɪnt": 1557, + "ÊĬn": 1558, + "Ġwɪl": 1559, + "Ġsin": 1560, + "ĠËĮalla": 1561, + "ĠaβËĪia": 1562, + "pi": 1563, + "ËĪoÉľ": 1564, + "ɪjËĮaËIJ": 1565, + "ku": 1566, + "ĠvËĪɪ": 1567, + "Ġtut": 1568, + "ĠtËĪeÉľ": 1569, + "ĠhËĪÉĶ": 1570, + "βɾe": 1571, + "sÉĻɾ": 1572, + "ĠkhËĪai": 1573, + "ĠmËĪÉĶ": 1574, + "Ġta": 1575, + "ĠɲËĪaËIJ": 1576, + "Ġnu": 1577, + "ËĪuËIJn": 1578, + "ĠÉĻËIJÉľ": 1579, + "ĠËĪaÊĬf": 1580, + "ËĪiËIJdÉľ": 1581, + "nti": 1582, + "ĠpËĪiËIJpÉĻl": 1583, + "Ġkj": 1584, + "Ġpe": 1585, + "ĠmËĪÉij": 1586, + "ËĮaɪ": 1587, + "ËĪaËIJle": 1588, + "ĠvËĮÉĻËIJÉªÉľ": 1589, + "mpo": 1590, + "ĠkËĪɪt": 1591, + "ĠnËĮÉĽ": 1592, + "ĠÉŁËĪaËIJtaËIJ": 1593, + "ĠsËĪaËIJtʰ": 1594, + "ĠÉŁËĪi": 1595, + "Ġso": 1596, + "ĠbËĪÉĽ": 1597, + "kËĪi": 1598, + "ɪti": 1599, + "Ġtsi": 1600, + "ĠkÊģ": 1601, + "ËĮÉĴ": 1602, + "É¡ÉĻl": 1603, + "kst": 1604, + "ĠmËĪÉĻËIJ": 1605, + "ËĪÊĮk": 1606, + 
"ĠnËĪaËIJÊĬ": 1607, + "Ġap": 1608, + "ĠlËĪɪkʰ": 1609, + "lli": 1610, + "ĠkwËĪal": 1611, + "ĠËĪÉĻËIJ": 1612, + "ĠtsËĪuei": 1613, + "Ġdo": 1614, + "ĠkËIJjËĪo": 1615, + "ÊĬz": 1616, + "ĠpËĪaËIJ": 1617, + "ĠmËĪuËIJ": 1618, + "ĠÉ¡ÉĻv": 1619, + "rËĪi": 1620, + "Ġtw": 1621, + "ËĮɪn": 1622, + "dËĪÉij": 1623, + "ĠðËĪi": 1624, + "ĠËĪaËIJi": 1625, + "ĠhËĪiÉĽ": 1626, + "ĠðËĮÉĽm": 1627, + "ĠpʰËĪɪɾ": 1628, + "ÉĴm": 1629, + "ĠËĮeËIJ": 1630, + "ĠthËĪaiÉľ": 1631, + "ĠvËĪas": 1632, + "ĠnÉijËIJ": 1633, + "pÉĻn": 1634, + "ĠpËĮÉĻɾ": 1635, + "ĠÉĹËĪaËIJɪ": 1636, + "ËĪouÉľ": 1637, + "ĠÊIJËĪuÉľ": 1638, + "ĠmËĪan": 1639, + "ĠtËĪÉĻÉªÉľ": 1640, + "ĠlËĪaËIJÊĬ": 1641, + "mËĪÉĽnte": 1642, + "ĠfËĪam": 1643, + "sjËĪÉĶ": 1644, + "ĠpËĪÉĻ": 1645, + "ËĪeËIJm": 1646, + "ĠpËĪÊĮr": 1647, + "jËĪi": 1648, + "ĠlÉĽ": 1649, + "Ġten": 1650, + "ËĪoËIJra": 1651, + "ki": 1652, + "ĠÊĤËĪaËIJÊĬ": 1653, + "kɪ": 1654, + "bËIJe": 1655, + "ËĪalt": 1656, + "ðɪ": 1657, + "pËĪi": 1658, + "ĠËĮÉĽnt": 1659, + "ĠmËĪei": 1660, + "ĠhËĪÉĻÊĬ": 1661, + "ĠhËĪÉĽÉ¾": 1662, + "jËĪÉij": 1663, + "ĠhËĪÊĬaËIJ": 1664, + "mÉľ": 1665, + "Ġdʰ": 1666, + "ĠtÊĥËĪe": 1667, + "lËĪÉĽ": 1668, + "ËĪaËIJte": 1669, + "ĠpËĪuËIJ": 1670, + "ĠmËĪÊĬ": 1671, + "ËĪaËIJɪÊĪ": 1672, + "diËIJ": 1673, + "ĠfɹÉĴm": 1674, + "ĠhËĪÉijËIJ": 1675, + "βo": 1676, + "ĠmËĪiÉľn": 1677, + "ĠðiËIJz": 1678, + "ĠkËĪou": 1679, + "ËĪiËIJna": 1680, + "ĠavËĮeva": 1681, + "ĠËĪaËIJɾ": 1682, + "ĠnËĪuËIJɾ": 1683, + "ĠβËĪe": 1684, + "Ġzaɪn": 1685, + "ËĪÉĽd": 1686, + "ÉĹ": 1687, + "ËĪeɪk": 1688, + "sËĮÉĻÊĬ": 1689, + "ËĪeËIJÉŁ": 1690, + "ĠÊĤËĪÉĻËIJ": 1691, + "je": 1692, + "cʰËIJ": 1693, + "ËĪÉĶr": 1694, + "ÉĽËIJ": 1695, + "ĠtÉķhËĪyÃ¦Éľn": 1696, + "ĠËĮaɪnÉĻn": 1697, + "ĠiËIJn": 1698, + "ĠbËĪÊĮc": 1699, + "ËĪiËIJm": 1700, + "ɾas": 1701, + "ËĮÉĻs": 1702, + "ĠvËĪeËIJ": 1703, + "ĠËĪÉĻrÉľ": 1704, + "ĠduËIJ": 1705, + "ntÉĻ": 1706, + "ĠpɹËĪÉĴ": 1707, + "ĠbËĪɪ": 1708, + "ĠwËĪoÉľ": 1709, + "nËĮi": 1710, + "ĠhÉIJ": 1711, + "ĠkËĪÉĽ": 1712, + "Ġet": 1713, + "jËĪÉĽndo": 1714, + "ĠËĪaiÉľ": 1715, + "Ġli": 
1716, + "ĠËĪaÊĬs": 1717, + "kËIJo": 1718, + "ĠÉĹËĪyÉĻ": 1719, + "keËIJ": 1720, + "ĠfËĪiËIJl": 1721, + "ĠbʰËĪaËIJi": 1722, + "ĠÉ¡ÉĻÊĥ": 1723, + "ÊĴËĪe": 1724, + "ĠnjËĪuËIJ": 1725, + "ĠËĪak": 1726, + "ĠÉĹËĪaËIJ": 1727, + "zËĪa": 1728, + "vËĪe": 1729, + "ĠhËĮaÊĬ": 1730, + "ÉIJç": 1731, + "ĠɾËĪÊĮkʰ": 1732, + "pËĪe": 1733, + "ĠtÉĻbi": 1734, + "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, + "ĠfËĪÉĽ": 1736, + "ĠwËĮɪtÊĥ": 1737, + "ĠtÉķËĪyÉĽÉľ": 1738, + "wËĮe": 1739, + "ËĮaɪt": 1740, + "ĠnÉijËIJx": 1741, + "ĠkËĪÉĶËIJn": 1742, + "ÊĬk": 1743, + "ĠbËĪaËIJd": 1744, + "ÅĭÉĻn": 1745, + "Ġni": 1746, + "ĠbËĪe": 1747, + "ĠmËĮÊĬ": 1748, + "ËĪar": 1749, + "ĠmËĮeɪk": 1750, + "ĠsËĪaËIJɾ": 1751, + "βe": 1752, + "ĠtÉķhËĪiÉľÅĭ": 1753, + "itËĪe": 1754, + "kËĮe": 1755, + "ËĪÉĽËIJl": 1756, + "ËĮÉĴn": 1757, + "ËĮÉij": 1758, + "ĠbËĪɪl": 1759, + "ĠwÊĬd": 1760, + "ĠbËĪoËIJl": 1761, + "rd": 1762, + "iÉĻ": 1763, + "Ġda": 1764, + "ĠbËĪaËIJÊĬ": 1765, + "ĠnËĪÊĮmbÉĻɾ": 1766, + "ËĪaËIJÉªÉľ": 1767, + "ĠÉĽm": 1768, + "ĠmiËIJɾ": 1769, + "ËĪeɪm": 1770, + "los": 1771, + "ËĮÉĽt": 1772, + "ĠËĮaÊĬs": 1773, + "ĠmËĪaÉľt": 1774, + "ĠwËĪuÉĻ": 1775, + "ĠwËĪeɪ": 1776, + "Ġseɲ": 1777, + "ĠbjËĪÉĽ": 1778, + "ĠwÉĽn": 1779, + "fl": 1780, + "ĠkhwËĪa": 1781, + "dËĪÉĽ": 1782, + "vɹɪ": 1783, + "ĠËĪaɾ": 1784, + "jËĪÉijuÉľ": 1785, + "ĠËĮaËIJpkËĮeËIJ": 1786, + "bÊģ": 1787, + "ĠtËĪaɪm": 1788, + "ĠËĪÉij": 1789, + "ĠsËĮa": 1790, + "ĠzËĪoɪ": 1791, + "ËĪÉĶɾa": 1792, + "ĠdËĪø": 1793, + "ËĪÉĶɾt": 1794, + "ĠÅĭËĪÉĶ": 1795, + "min": 1796, + "ĠlËĪÊĬk": 1797, + "ËĪÉĶËIJt": 1798, + "ĠËĪÉĶtɾ": 1799, + "ĠfËĪaɪ": 1800, + "ĠÉ¡ÉĴt": 1801, + "ËĪeËIJÉĻn": 1802, + "kËĪÉĶ": 1803, + "ĠvËĪÉĽÉ¹i": 1804, + "mÉĽ": 1805, + "ËĪaɪz": 1806, + "Ġesp": 1807, + "ɲa": 1808, + "ĠlËĪo": 1809, + "ËĪÉĽËIJra": 1810, + "βËĪi": 1811, + "ouÉľ": 1812, + "ËĮÉĻk": 1813, + "tÊĥuËIJ": 1814, + "ĠnËĪyÉĻ": 1815, + "ÊĪɾ": 1816, + "ĠÉ¡ËĪy": 1817, + "ĠtËĪoðo": 1818, + "ËĪɪçt": 1819, + "Ġmɪç": 1820, + "ĠËĪand": 1821, + "ĠkwËĮÉĽl": 1822, + "ĠÊĤËĪaËIJ": 1823, + "ĠnËĪiÉľ": 1824, + "ËĪÉĶp": 1825, + 
"ËĪiËIJz": 1826, + "ĠÊĤËĪaÊĬ": 1827, + "ĠɾËĮÉĻhi": 1828, + "ĠsËĮÊĬo": 1829, + "ĠÉĽÉ¡": 1830, + "ĠdÅĵ": 1831, + "ĠÉ¡ËĮaËIJÉªÉľ": 1832, + "dɪ": 1833, + "lËĮa": 1834, + "stËĪi": 1835, + "ĠdËĮiËIJz": 1836, + "ĠtËĮÊĬ": 1837, + "θi": 1838, + "ĠËĪɪskËĮoËIJ": 1839, + "ndÉĻn": 1840, + "Ġtsv": 1841, + "ĠhËĪÉĻËIJ": 1842, + "ĠÊĥËĪÊĬ": 1843, + "ÉĻtËĮeËIJ": 1844, + "pËĮÉĽ": 1845, + "ËĪaɾÉĶn": 1846, + "ĠpÉĽÊģ": 1847, + "Ġy": 1848, + "mnËĮeËIJ": 1849, + "ËĪÉĽllo": 1850, + "ĠÉ¡ËĪÉĻ": 1851, + "ĠËĮad": 1852, + "ĠÊĥv": 1853, + "ËĪÊıɾ": 1854, + "rËĪe": 1855, + "yËIJ": 1856, + "ĠpËĪaËIJs": 1857, + "ĠËĪÉĽn": 1858, + "ɪdÊĴ": 1859, + "ËĪuai": 1860, + "Ġfi": 1861, + "ĠtËĪyÉĻ": 1862, + "ËĪaËIJÉŁ": 1863, + "ĠtjËĪe": 1864, + "ËĪaËIJnaËIJ": 1865, + "stɾ": 1866, + "Êİe": 1867, + "ËĮeɪt": 1868, + "ba": 1869, + "ðas": 1870, + "vÊģ": 1871, + "ĠzËĪÉĻËIJ": 1872, + "ËĪaËIJli": 1873, + "ÉŁÊ°eËIJ": 1874, + "ËĪaËIJteËIJ": 1875, + "ĠvËĪa": 1876, + "Ġsal": 1877, + "ËĪaËIJno": 1878, + "ĠÉ¡ÉĻz": 1879, + "ĠhËĪoËIJti": 1880, + "ĠɲËĪiÉĽ": 1881, + "tÉľ": 1882, + "ĠËĪaËIJp": 1883, + "ĠwËĪÉĽl": 1884, + "ĠmËĪɪl": 1885, + "ĠfyËIJɾ": 1886, + "ËĪÉĽËIJsaËIJ": 1887, + "ĠbËĮiËIJ": 1888, + "ËĪaËIJjaËIJ": 1889, + "ËĪɪp": 1890, + "ĠfÊģ": 1891, + "tsiËĪoËIJne": 1892, + "ĠwËĪuÉľ": 1893, + "Ġvi": 1894, + "ĠwËĪÉijÉľn": 1895, + "ËĪoËIJn": 1896, + "ĠÉĹËĪÉĻɪ": 1897, + "ĠÊĿËĪo": 1898, + "Ġra": 1899, + "mÉĻnt": 1900, + "ËĪaÊĬnd": 1901, + "ĠpÉĽÉ¾": 1902, + "ĠÉĹËĪaËIJÊĬ": 1903, + "oËIJɾ": 1904, + "hËĪo": 1905, + "ĠÉĴn": 1906, + "ĠÊİe": 1907, + "ĠsËĪɪks": 1908, + "É¡n": 1909, + "ĠÉ¡ËĪa": 1910, + "Ġθj": 1911, + "ĠpËĪe": 1912, + "spe": 1913, + "ĠvËĪÉĻ": 1914, + "ĠfËĪɪ": 1915, + "ĠËĮɪntÊĬ": 1916, + "lÉĻn": 1917, + "ĠnËĪiËIJd": 1918, + "ĠsËĮÊĬa": 1919, + "ĠËĪum": 1920, + "ĠdËĪeɪ": 1921, + "ĠËĪÊĮbʰi": 1922, + "ËĪÉijËIJɾ": 1923, + "ĠbËĪiÉĽÉľt": 1924, + "Êİos": 1925, + "ĠtshËĪaiÉľ": 1926, + "ĠËĮɪskËĮaËIJ": 1927, + "ĠaÊĬÉĻ": 1928, + "ĠËĪyæ": 1929, + "Ġdyn": 1930, + "ĠmËĪiËIJn": 1931, + "ĠËĪÊĮcʰËIJ": 1932, + "ĠsÉĽ": 1933, + "ĠnËĪy": 1934, + 
"ĠnËĮÉĽl": 1935, + "ɡɾ": 1936, + "ÊĥËĪe": 1937, + "ĠÊĤËĮÉĽ": 1938, + "ĠËĪÉĽvɹɪ": 1939, + "ËĪÉĽlp": 1940, + "ĠbËĪak": 1941, + "ĠeËIJ": 1942, + "ĠfËĪaËIJ": 1943, + "ĠkÉĽl": 1944, + "ĠËĪeËIJs": 1945, + "jËĪaËIJd": 1946, + "ĠlËĮi": 1947, + "mbɾe": 1948, + "ktÉĻ": 1949, + "nta": 1950, + "tËĪu": 1951, + "ĠðËĪat": 1952, + "ĠËĪaβ": 1953, + "ÉĻɹi": 1954, + "ĠkwËĮÉĽlla": 1955, + "ĠbÉĻn": 1956, + "rËĮÉĽ": 1957, + "ĠnÉĶ": 1958, + "ĠÉ¡ËĪɪ": 1959, + "ĠËĪap": 1960, + "ɹÉĻ": 1961, + "ËĪaÉľkh": 1962, + "ĠÊIJËĪi": 1963, + "ĠËĪÉijËIJ": 1964, + "ɪɡÉĻn": 1965, + "ĠwËĪai": 1966, + "ĠpÉĻt": 1967, + "kËIJa": 1968, + "ĠbËĪÉĽËIJ": 1969, + "ËĪeËIJÊĭ": 1970, + "lsÉĻÊĬ": 1971, + "ĠcËĪaËIJhɪËĮeËIJ": 1972, + "ĠkÉĻn": 1973, + "ĠËĮaɪnÉĻm": 1974, + "ËĪuËIJt": 1975, + "ĠhËĪaÊĬ": 1976, + "ĠtËĪanto": 1977, + "ĠhÉIJz": 1978, + "ĠsËĪÊĮɾ": 1979, + "Ġno": 1980, + "ĠtËĪÉĶËIJ": 1981, + "ĠzËĪaɪ": 1982, + "ĠtÉķËĪiÉĽÉľ": 1983, + "ĠkozËĪi": 1984, + "ĠkËĪei": 1985, + "ðËĪÉĶɾ": 1986, + "ËĮÉĶÊģ": 1987, + "ĠtËĪÊĮɾ": 1988, + "ĠÊIJËĪÉĻ": 1989, + "ĠÉķËĪyÉĽÉľ": 1990, + "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, + "mf": 1992, + "ĠvËĪiËIJdÉľ": 1993, + "kËĪa": 1994, + "ĠÉIJÉ¡": 1995, + "kw": 1996, + "ĠÊģÉĽ": 1997, + "xÉĻn": 1998, + "ĠdÊĬ": 1999, + "ĠkËĪÊĮɾnËĮeËIJ": 2000, + "jËĪaËIJdaËIJ": 2001, + "ĠfÉĻ": 2002, + "ĠËĮimp": 2003, + "Ġhɪz": 2004, + "ĠʰÏĩ": 2005, + "ËĪoËIJni": 2006, + "ĠxËĪiÉľ": 2007, + "ËĪeËIJsÊĪ": 2008, + "ÊıbÉľ": 2009, + "ËĮÉĶɾke": 2010, + "ĠÉ¡ËĪÉĻÊĬ": 2011, + "ËĪɪÊĥÉĻn": 2012, + "les": 2013, + "ĠfËĪiËIJ": 2014, + "É¡tÉĻ": 2015, + "ËĪeËIJre": 2016, + "ĠvËĮaËIJ": 2017, + "ĠËĪeɪ": 2018, + "ĠmËĪuÉĻÉľn": 2019, + "ĠÉ¡ËĪÊĬd": 2020, + "ĠmËĮaɪn": 2021, + "zËĪe": 2022, + "ĠlËĪiÉľ": 2023, + "Ġmu": 2024, + "ĠkËĮÉĽl": 2025, + "ĠjËĮÉĻh": 2026, + "ĠfËĮÉĶɾ": 2027, + "fɹ": 2028, + "ĠkËĪaɪn": 2029, + "ĠËĪÉĴlsÉĻÊĬ": 2030, + "θɪÅĭ": 2031, + "ĠthËĪonÉ¡Éľ": 2032, + "tËĪÉij": 2033, + "θjo": 2034, + "mËĪÉĶ": 2035, + "Ġos": 2036, + "ĠsÊĬ": 2037, + "ĠsËĪÊĮmÉĻ": 2038, + "ĠvËĮÉĽn": 2039, + "nËĪo": 2040, + "ĠËĪaktÊĥuËIJ": 2041, + "É£a": 2042, + "Ġtʰi": 
2043, + "ĠfËĮi": 2044, + "ĠvËĪÉĽl": 2045, + "ĠtËĪutËIJi": 2046, + "xos": 2047 + }, + "merges": [ + [ + "Ë", + "Ī" + ], + [ + "Ë", + "IJ" + ], + [ + "ËĪ", + "É" + ], + [ + "Ë", + "Į" + ], + [ + "É", + "Ļ" + ], + [ + "ËĪ", + "a" + ], + [ + "ËĪ", + "i" + ], + [ + "Ġ", + "t" + ], + [ + "É", + "ª" + ], + [ + "É", + "¾" + ], + [ + "Ġ", + "É" + ], + [ + "Ġ", + "k" + ], + [ + "É", + "ľ" + ], + [ + "Ġ", + "s" + ], + [ + "ËĪ", + "e" + ], + [ + "É", + "Ľ" + ], + [ + "ËĪ", + "o" + ], + [ + "Ġ", + "l" + ], + [ + "ËĪÉ", + "Ľ" + ], + [ + "Ġ", + "d" + ], + [ + "Ê", + "Ĭ" + ], + [ + "ËĪa", + "ËIJ" + ], + [ + "Ġ", + "p" + ], + [ + "Ì", + "ĥ" + ], + [ + "Ġ", + "m" + ], + [ + "ËĪ", + "u" + ], + [ + "Å", + "ĭ" + ], + [ + "Ã", + "°" + ], + [ + "ËĪÉ", + "Ķ" + ], + [ + "Ê", + "Į" + ], + [ + "ËĮ", + "a" + ], + [ + "Ġ", + "h" + ], + [ + "ËĪ", + "ÊĮ" + ], + [ + "Ġ", + "n" + ], + [ + "Ê", + "ģ" + ], + [ + "ËĪÉ", + "ij" + ], + [ + "Ê", + "ĥ" + ], + [ + "e", + "ËIJ" + ], + [ + "Ġ", + "a" + ], + [ + "Ġ", + "b" + ], + [ + "É", + "Ķ" + ], + [ + "ËĪÉ", + "Ļ" + ], + [ + "ÉĻ", + "n" + ], + [ + "Ġ", + "f" + ], + [ + "ËĪÉ", + "ª" + ], + [ + "É", + "¡" + ], + [ + "ËĪe", + "ËIJ" + ], + [ + "Ġ", + "j" + ], + [ + "n", + "t" + ], + [ + "Ġ", + "ð" + ], + [ + "Ġ", + "ËĮ" + ], + [ + "Ġt", + "s" + ], + [ + "ĠÉ", + "¡" + ], + [ + "É", + "ķ" + ], + [ + "ËĪo", + "ËIJ" + ], + [ + "Ê", + "°" + ], + [ + "a", + "ËIJ" + ], + [ + "ËĪ", + "y" + ], + [ + "Ġt", + "Éķ" + ], + [ + "ËĪi", + "ËIJ" + ], + [ + "Ġ", + "Ê" + ], + [ + "Ġ", + "v" + ], + [ + "Ġ", + "w" + ], + [ + "s", + "t" + ], + [ + "É", + "ij" + ], + [ + "n", + "d" + ], + [ + "ËĮ", + "i" + ], + [ + "Ì", + "ª" + ], + [ + "ËĮ", + "e" + ], + [ + "Ġ", + "z" + ], + [ + "ËĪa", + "ɪ" + ], + [ + "ËĪi", + "ÉĽ" + ], + [ + "Î", + "²" + ], + [ + "É", + "¹" + ], + [ + "Ġ", + "ËĮa" + ], + [ + "Î", + "¸" + ], + [ + "Ġh", + "ÉĽ" + ], + [ + "Ê", + "Ī" + ], + [ + "i", + "ËIJ" + ], + [ + "ËĮ", + "o" + ], + [ + "Ġ", + "ɪ" + ], + [ + "Éľ", + "n" + ], + [ + "Ġ", + "x" + ], + [ + "Ġt", 
+ "ÉĻ" + ], + [ + "ËĪu", + "ËIJ" + ], + [ + "ËĮ", + "ÉĻ" + ], + [ + "Ġj", + "ËĪi" + ], + [ + "ËĮ", + "ÉĽ" + ], + [ + "ĠÉ", + "Ľ" + ], + [ + "Ġ", + "ËĪa" + ], + [ + "ËĮa", + "ËIJ" + ], + [ + "Ġl", + "a" + ], + [ + "Ġð", + "e" + ], + [ + "ĠhÉĽ", + "ËIJ" + ], + [ + "Ġ", + "e" + ], + [ + "Ã", + "§" + ], + [ + "ÉĻ", + "l" + ], + [ + "o", + "ËIJ" + ], + [ + "ËĪÉij", + "u" + ], + [ + "Ê", + "Ĵ" + ], + [ + "u", + "ËIJ" + ], + [ + "ĠÉ", + "Ĺ" + ], + [ + "ĠÉ", + "ķ" + ], + [ + "ËĮ", + "eËIJ" + ], + [ + "ĠtÉķ", + "ËĪi" + ], + [ + "o", + "s" + ], + [ + "ËĪÉĶ", + "ËIJ" + ], + [ + "a", + "s" + ], + [ + "ËĪ", + "ÊĬ" + ], + [ + "Ġ", + "i" + ], + [ + "ËĪa", + "i" + ], + [ + "É", + "²" + ], + [ + "ɪ", + "n" + ], + [ + "t", + "s" + ], + [ + "Éľ", + "Åĭ" + ], + [ + "ĠÉ", + "Ł" + ], + [ + "Ġ", + "Êĥ" + ], + [ + "ËĪe", + "ɪ" + ], + [ + "ÉĽ", + "ɾ" + ], + [ + "ËĪÉĽ", + "ËIJ" + ], + [ + "ËĪÉĽ", + "ɾ" + ], + [ + "Ġ", + "r" + ], + [ + "t", + "Êĥ" + ], + [ + "ËĮ", + "ÉĶ" + ], + [ + "Ġd", + "ÉĻ" + ], + [ + "t", + "ÉĻ" + ], + [ + "o", + "u" + ], + [ + "ËĪy", + "ÉĻ" + ], + [ + "ĠËĮ", + "i" + ], + [ + "ÉĻ", + "ɾ" + ], + [ + "ËĪÉĻ", + "ÊĬ" + ], + [ + "ËĪÊĮ", + "ɾ" + ], + [ + "ËĪÉ", + "Ĵ" + ], + [ + "Ġt", + "h" + ], + [ + "ËĪo", + "n" + ], + [ + "Ê", + "ĭ" + ], + [ + "ËĪÉij", + "ËIJ" + ], + [ + "ËĪÊĮ", + "h" + ], + [ + "w", + "ËĪa" + ], + [ + "ËĪe", + "i" + ], + [ + "l", + "l" + ], + [ + "ĠÉ", + "IJ" + ], + [ + "Éij", + "ËIJ" + ], + [ + "a", + "n" + ], + [ + "É", + "Ł" + ], + [ + "ĠÊ", + "ĭ" + ], + [ + "Ġk", + "o" + ], + [ + "k", + "h" + ], + [ + "ɪ", + "Åĭ" + ], + [ + "ËĪaËIJ", + "ɪ" + ], + [ + "Ġt", + "Êĥ" + ], + [ + "ËĪaËIJ", + "t" + ], + [ + "ĠËĮ", + "e" + ], + [ + "ĠtÉķ", + "h" + ], + [ + "ËĪu", + "o" + ], + [ + "ËĪon", + "É¡" + ], + [ + "É", + "ĸ" + ], + [ + "a", + "t" + ], + [ + "Ġk", + "e" + ], + [ + "É", + "Ĵ" + ], + [ + "ĠÉķ", + "ËĪi" + ], + [ + "Ã", + "¸" + ], + [ + "ĠÉ", + "ij" + ], + [ + "ËĪeËIJ", + "k" + ], + [ + "Å", + "ĵ" + ], + [ + "r", + "e" + ], + [ + "Ġ", + "ɾ" + ], + [ + "Ġk", 
+ "ÉĶ" + ], + [ + "ËĮ", + "ÊĬ" + ], + [ + "s", + "k" + ], + [ + "Ġ", + "ÊĬ" + ], + [ + "Ġa", + "nd" + ], + [ + "ɪ", + "ç" + ], + [ + "Ġm", + "e" + ], + [ + "ËĪa", + "ɾ" + ], + [ + "Ġ", + "ËĪɪ" + ], + [ + "n", + "a" + ], + [ + "Ġ", + "β" + ], + [ + "Ġl", + "ËĪi" + ], + [ + "j", + "aËIJ" + ], + [ + "l", + "i" + ], + [ + "n", + "o" + ], + [ + "Ġɪ", + "n" + ], + [ + "Ġd", + "ËĮi" + ], + [ + "ĠÉ", + "²" + ], + [ + "t", + "ËIJ" + ], + [ + "ÉĻ", + "m" + ], + [ + "Ġl", + "ÉĻ" + ], + [ + "Ġð", + "ÉĻ" + ], + [ + "ɪ", + "k" + ], + [ + "ËĪÉĽ", + "l" + ], + [ + "Éľ", + "t" + ], + [ + "Ġs", + "e" + ], + [ + "e", + "s" + ], + [ + "ËĪo", + "u" + ], + [ + "ËĪa", + "ÊĬ" + ], + [ + "ĠÉ", + "Ķ" + ], + [ + "ɪ", + "t" + ], + [ + "Ġ", + "Åĭ" + ], + [ + "ËĪÉĽ", + "n" + ], + [ + "Ê", + "İ" + ], + [ + "Ġk", + "h" + ], + [ + "ËĪÉĽ", + "nt" + ], + [ + "ËĪaËIJ", + "ɾ" + ], + [ + "Ġk", + "i" + ], + [ + "m", + "p" + ], + [ + "l", + "t" + ], + [ + "É", + "£" + ], + [ + "Ġp", + "a" + ], + [ + "ËĪÉĻ", + "ËIJ" + ], + [ + "ɪ", + "s" + ], + [ + "ĠÉ", + "Ĵ" + ], + [ + "Ġl", + "e" + ], + [ + "ɪ", + "Éľ" + ], + [ + "ËĪÉĽ", + "t" + ], + [ + "Ġd", + "e" + ], + [ + "ĠÉ", + "¹" + ], + [ + "Ġt", + "ËĪoËIJ" + ], + [ + "Ġ", + "Êģ" + ], + [ + "Êĥ", + "ÉĻn" + ], + [ + "ĠÊĬ", + "nt" + ], + [ + "ËĪÉĶ", + "ɾ" + ], + [ + "ËĪa", + "ð" + ], + [ + "Ġa", + "ɪ" + ], + [ + "ĠÊ", + "IJ" + ], + [ + "Ġm", + "ËĪa" + ], + [ + "r", + "a" + ], + [ + "Ġk", + "ËĪɪ" + ], + [ + "k", + "t" + ], + [ + "ËIJ", + "p" + ], + [ + "ĠÊ", + "Ī" + ], + [ + "ËĪaËIJ", + "ÊĬ" + ], + [ + "Ġk", + "ËĪÊĮɾ" + ], + [ + "Ġ", + "ËĪÊĮ" + ], + [ + "ĠÉĴ", + "v" + ], + [ + "Ġe", + "l" + ], + [ + "k", + "s" + ], + [ + "Ġk", + "w" + ], + [ + "ÉĻ", + "t" + ], + [ + "nd", + "o" + ], + [ + "e", + "i" + ], + [ + "ĠËĮa", + "ËIJp" + ], + [ + "s", + "e" + ], + [ + "ÉĻ", + "ɹ" + ], + [ + "ËĪu", + "ei" + ], + [ + "ÉĻ", + "s" + ], + [ + "Ġk", + "ËĮo" + ], + [ + "ĠÊ", + "Ĥ" + ], + [ + "ĠËĮ", + "ÊĬ" + ], + [ + "Ġ", + "c" + ], + [ + "ĠÉĽ", + "n" + ], + [ + "ËĪa", + "nt" + 
], + [ + "θ", + "j" + ], + [ + "ËĮo", + "ËIJ" + ], + [ + "Ġ", + "ËĪaËIJ" + ], + [ + "Ġp", + "ɾ" + ], + [ + "s", + "i" + ], + [ + "Ġ", + "ËĪe" + ], + [ + "Ġj", + "uËIJ" + ], + [ + "Ġk", + "ËĮe" + ], + [ + "ËĮ", + "ɪ" + ], + [ + "ÉĶ", + "n" + ], + [ + "Ġs", + "ËĪÊĮ" + ], + [ + "Ġ", + "ËĪu" + ], + [ + "n", + "i" + ], + [ + "Ġs", + "t" + ], + [ + "Ġd", + "iËIJ" + ], + [ + "Ġk", + "eËIJ" + ], + [ + "ĠjËĪi", + "ou" + ], + [ + "ËĪai", + "Éľ" + ], + [ + "Ġd", + "ÊĴ" + ], + [ + "Ġ", + "ËĪÉĶ" + ], + [ + "v", + "a" + ], + [ + "ËIJ", + "ɾ" + ], + [ + "ËĪ", + "ø" + ], + [ + "ËĮÉĻ", + "ÊĬ" + ], + [ + "Ġp", + "ËĪu" + ], + [ + "Ġs", + "u" + ], + [ + "Ġm", + "a" + ], + [ + "Ġ", + "ÉĻ" + ], + [ + "d", + "ÊĴ" + ], + [ + "Ġp", + "ʰ" + ], + [ + "l", + "e" + ], + [ + "i", + "n" + ], + [ + "ĠtÉķh", + "ËĪi" + ], + [ + "Ġw", + "ËĪo" + ], + [ + "r", + "o" + ], + [ + "ËĮ", + "y" + ], + [ + "ɾ", + "a" + ], + [ + "Ġs", + "ËĪi" + ], + [ + "ð", + "ÉĻ" + ], + [ + "Ġs", + "eËIJ" + ], + [ + "l", + "a" + ], + [ + "ĠÊ", + "Ĵ" + ], + [ + "m", + "b" + ], + [ + "Ġh", + "ËĪoËIJ" + ], + [ + "Ġb", + "ʰ" + ], + [ + "ĠÉĽ", + "ɾ" + ], + [ + "Ġð", + "at" + ], + [ + "s", + "p" + ], + [ + "ÉĶ", + "ɾ" + ], + [ + "e", + "n" + ], + [ + "Ġs", + "ÉĻ" + ], + [ + "ËĪÉĶ", + "Éľ" + ], + [ + "Ġl", + "ËĮa" + ], + [ + "ĠËĮ", + "ÉĽ" + ], + [ + "Ġ", + "ËĪy" + ], + [ + "É¡", + "aËIJ" + ], + [ + "Ġd", + "ÉĽÉ¾" + ], + [ + "ËĪÉĽ", + "Êģ" + ], + [ + "Éľ", + "kh" + ], + [ + "ËĪi", + "ÉĻ" + ], + [ + "ËĪa", + "n" + ], + [ + "Ġm", + "ËĪo" + ], + [ + "ËĪa", + "β" + ], + [ + "Ġa", + "l" + ], + [ + "Ġ", + "ËĪeËIJ" + ], + [ + "Ġ", + "θ" + ], + [ + "Ġn", + "ËĪi" + ], + [ + "p", + "ʰ" + ], + [ + "ll", + "a" + ], + [ + "Ġp", + "l" + ], + [ + "ËĪ", + "Åĵ" + ], + [ + "j", + "ËĪÉiju" + ], + [ + "Ġa", + "v" + ], + [ + "Ġm", + "ËĪi" + ], + [ + "Ġf", + "ËĪa" + ], + [ + "ËĪÉ", + "ľ" + ], + [ + "m", + "e" + ], + [ + "ËĮÉĻ", + "h" + ], + [ + "ËĪu", + "ÉĻ" + ], + [ + "i", + "t" + ], + [ + "j", + "ËĪe" + ], + [ + "Ġ", + "o" + ], + [ + "ËĪÉľ", + "ËIJ" + 
], + [ + "ĠtÉķËĪi", + "ou" + ], + [ + "ÉĶ", + "ËIJ" + ], + [ + "Ġn", + "ÉĻ" + ], + [ + "ËĪÉĻ", + "Éľn" + ], + [ + "Ġm", + "ÉĻ" + ], + [ + "Ġd", + "eËIJ" + ], + [ + "m", + "o" + ], + [ + "s", + "a" + ], + [ + "j", + "ËĪÉĶ" + ], + [ + "ËĪa", + "l" + ], + [ + "ĠtÉķ", + "ËĪiÉĽ" + ], + [ + "ĠÉ¡", + "ÉĻ" + ], + [ + "ð", + "a" + ], + [ + "Ġɪ", + "z" + ], + [ + "Ġs", + "a" + ], + [ + "r", + "i" + ], + [ + "ĠËĮi", + "l" + ], + [ + "ËĮ", + "u" + ], + [ + "Ġk", + "aËIJ" + ], + [ + "ĠÉĻ", + "ËIJ" + ], + [ + "ĠÉ", + "ĸ" + ], + [ + "Ġk", + "a" + ], + [ + "ËĪÊĮh", + "i" + ], + [ + "Ġj", + "eËIJ" + ], + [ + "Ġt", + "ʰ" + ], + [ + "n", + "e" + ], + [ + "k", + "ËIJ" + ], + [ + "Ġts", + "ËĪai" + ], + [ + "Ġ", + "ËĪeËIJk" + ], + [ + "n", + "k" + ], + [ + "t", + "i" + ], + [ + "ËĪa", + "Éľn" + ], + [ + "Ġk", + "ËIJ" + ], + [ + "É¡", + "ÉĻn" + ], + [ + "ËĪi", + "a" + ], + [ + "ĠÉĶ", + "ËIJɾ" + ], + [ + "Ê", + "ı" + ], + [ + "ĠËĮ", + "ÊĮ" + ], + [ + "Ġz", + "ËĪaËIJ" + ], + [ + "Ġl", + "os" + ], + [ + "ÉĽ", + "s" + ], + [ + "ËĪÉĶ", + "n" + ], + [ + "ÉĽ", + "nt" + ], + [ + "ÉĽ", + "n" + ], + [ + "ĠÉŁ", + "ËĪoËIJ" + ], + [ + "ç", + "t" + ], + [ + "Ġd", + "as" + ], + [ + "Ġx", + "ËĮo" + ], + [ + "ËĪu", + "Éľ" + ], + [ + "ËĪa", + "s" + ], + [ + "Ġb", + "ËĪÊĮ" + ], + [ + "ËĪiÉĽ", + "Éľn" + ], + [ + "É", + "IJ" + ], + [ + "Ġts", + "uËIJ" + ], + [ + "Ġp", + "ËĮÉĽ" + ], + [ + "Ġn", + "ËĪÉĶ" + ], + [ + "ÊĬ", + "t" + ], + [ + "m", + "a" + ], + [ + "Ġn", + "ËĪo" + ], + [ + "Ġl", + "ËĪɪ" + ], + [ + "ËĪÉĽ", + "s" + ], + [ + "ɪ", + "l" + ], + [ + "ĠÉķ", + "ËĪiÉĽ" + ], + [ + "Ġ", + "ËĪÊĬ" + ], + [ + "ÉĴ", + "t" + ], + [ + "t", + "o" + ], + [ + "Ġ", + "ËĪo" + ], + [ + "ËĮo", + "n" + ], + [ + "Ġk", + "wËĪa" + ], + [ + "Ġɪ", + "t" + ], + [ + "Ġh", + "oËIJ" + ], + [ + "ËĪiËIJ", + "k" + ], + [ + "ĠËĮaËIJp", + "k" + ], + [ + "ËĪaɪ", + "n" + ], + [ + "Ã", + "¦" + ], + [ + "ÉĻn", + "t" + ], + [ + "t", + "a" + ], + [ + "l", + "o" + ], + [ + "Ġn", + "ËĪÉij" + ], + [ + "Ġl", + "ËĪa" + ], + [ + "ËĪi", + "Éľ" + ], + 
[ + "Ġw", + "ËĪei" + ], + [ + "ÉĽ", + "Êģ" + ], + [ + "Ġt", + "ËĪa" + ], + [ + "Ġɾ", + "ËĮÉĻh" + ], + [ + "ĠÉķËĪi", + "Éij" + ], + [ + "ËĮi", + "ËIJ" + ], + [ + "ËĮÉĽ", + "l" + ], + [ + "ĠtÉĻ", + "Éľ" + ], + [ + "Ġk", + "ËĪuo" + ], + [ + "Ġt", + "ËĪu" + ], + [ + "j", + "ËĪÉĽ" + ], + [ + "ĠËĮi", + "n" + ], + [ + "ɾ", + "e" + ], + [ + "Ġk", + "oËIJ" + ], + [ + "Ġk", + "ËĪa" + ], + [ + "ɾ", + "i" + ], + [ + "ĠtÉķËĪi", + "Éij" + ], + [ + "l", + "ÉĻ" + ], + [ + "Ġk", + "ÉĻ" + ], + [ + "Ġt", + "ËĪi" + ], + [ + "ĠÅĭ", + "ËĪyÉĻ" + ], + [ + "Ġts", + "h" + ], + [ + "e", + "r" + ], + [ + "a", + "v" + ], + [ + "ĠkÉĶ", + "n" + ], + [ + "ËĪÉĻ", + "ÉľÅĭ" + ], + [ + "ð", + "o" + ], + [ + "ËĪaËIJ", + "n" + ], + [ + "Ġbʰ", + "ËĪi" + ], + [ + "ĠkËIJ", + "jaËIJ" + ], + [ + "ÉĻ", + "z" + ], + [ + "Ġp", + "Êģ" + ], + [ + "Ġd", + "ËĪɪ" + ], + [ + "Ġz", + "iËIJ" + ], + [ + "É¡", + "eËIJ" + ], + [ + "Ġt", + "ËĪÉĻ" + ], + [ + "ɪ", + "z" + ], + [ + "Ġn", + "ËĮon" + ], + [ + "t", + "aËIJ" + ], + [ + "b", + "l" + ], + [ + "t", + "e" + ], + [ + "n", + "ËĮeËIJ" + ], + [ + "ËĪɪ", + "l" + ], + [ + "s", + "o" + ], + [ + "k", + "o" + ], + [ + "u", + "Êģ" + ], + [ + "ĠÉ", + "£" + ], + [ + "Ġpa", + "Êģ" + ], + [ + "Ġ", + "ËĪÉĽ" + ], + [ + "j", + "ËĪuËIJ" + ], + [ + "ËĮ", + "ÊĮ" + ], + [ + "y", + "n" + ], + [ + "ËĪiËIJ", + "n" + ], + [ + "Ġl", + "ËĪaɪ" + ], + [ + "ËĪɪ", + "Åĭ" + ], + [ + "ĠtÉķh", + "ËĪy" + ], + [ + "Ġn", + "ËĪÊĮhi" + ], + [ + "Ġd", + "ËĮe" + ], + [ + "Ġj", + "ËĪÉiju" + ], + [ + "Ġt", + "ËĪÉiju" + ], + [ + "Ġh", + "ËĪo" + ], + [ + "ɪ", + "d" + ], + [ + "Ġth", + "ËĪÉij" + ], + [ + "m", + "ËĪe" + ], + [ + "Ġ", + "ËĪÉĻ" + ], + [ + "j", + "a" + ], + [ + "Ġp", + "h" + ], + [ + "ÉĽ", + "t" + ], + [ + "Ġk", + "ËĪÊĮ" + ], + [ + "t", + "ÉĻn" + ], + [ + "m", + "ËĪÉij" + ], + [ + "w", + "ËĪe" + ], + [ + "ĠËĮa", + "ɪn" + ], + [ + "Ġð", + "ɪs" + ], + [ + "É¡", + "ÉĻ" + ], + [ + "Ġn", + "ËĪaËIJ" + ], + [ + "Ġb", + "ËĪaËIJ" + ], + [ + "Ġa", + "θ" + ], + [ + "Ġm", + "ËĮa" + ], + [ + "ËĪÊĮh", + "a" + ], 
+ [ + "Ġd", + "ËĮa" + ], + [ + "ËĪ", + "Êı" + ], + [ + "Ġɲ", + "ËĮy" + ], + [ + "Ġp", + "ËĪa" + ], + [ + "ËĪað", + "o" + ], + [ + "d", + "i" + ], + [ + "b", + "Éľ" + ], + [ + "É", + "³" + ], + [ + "Ġw", + "iËIJ" + ], + [ + "Ġn", + "ËĪɪ" + ], + [ + "ĠÉ¡", + "ËĪÉĶÉľ" + ], + [ + "tËIJ", + "o" + ], + [ + "ËĮÉĻ", + "m" + ], + [ + "ËĪaËIJ", + "r" + ], + [ + "Ġm", + "ÉĽ" + ], + [ + "ËĪeËIJ", + "É¡aËIJ" + ], + [ + "Ġs", + "ËĮi" + ], + [ + "Ġl", + "ËĮaËIJ" + ], + [ + "n", + "ËĮaËIJ" + ], + [ + "Ġs", + "p" + ], + [ + "t", + "Êģ" + ], + [ + "ĠÊ", + "İ" + ], + [ + "ËĮ", + "ÉijËIJ" + ], + [ + "Ġk", + "l" + ], + [ + "k", + "ʰ" + ], + [ + "i", + "l" + ], + [ + "ĠÊĥ", + "t" + ], + [ + "ĠËĮÊĬ", + "n" + ], + [ + "a", + "l" + ], + [ + "Ġs", + "ËĪÉĽ" + ], + [ + "Ġm", + "ËĪaËIJ" + ], + [ + "Ġ", + "Åĵ" + ], + [ + "ĠÉ¡", + "ËĪÊĮ" + ], + [ + "ĠpËĮÉĽ", + "r" + ], + [ + "ɾ", + "ËĪa" + ], + [ + "ËIJ", + "ÊĪ" + ], + [ + "ËĪaβ", + "a" + ], + [ + "Ġw", + "ËĪÉĴ" + ], + [ + "Ġx", + "ËĪuei" + ], + [ + "Ġkh", + "ËĪo" + ], + [ + "Ġla", + "s" + ], + [ + "ĠÉĹ", + "ËĪo" + ], + [ + "Ġf", + "ÉĽÉ¾" + ], + [ + "Ġj", + "ËĪiÉĽ" + ], + [ + "Ġt", + "ËĪe" + ], + [ + "Ġk", + "ËĮÉĶ" + ], + [ + "ĠdeËIJ", + "n" + ], + [ + "Ġm", + "o" + ], + [ + "Ġp", + "ËĪi" + ], + [ + "Ġt", + "ËĪÉij" + ], + [ + "ËĪÉĽ", + "st" + ], + [ + "w", + "ËĪÉij" + ], + [ + "ËĪaɪ", + "t" + ], + [ + "ÉĻ", + "ÊĬ" + ], + [ + "Ġ", + "ËĪi" + ], + [ + "ɪ", + "j" + ], + [ + "a", + "ɪ" + ], + [ + "ËĪaËIJ", + "Éľ" + ], + [ + "ĠËĪɪ", + "s" + ], + [ + "Ġp", + "ÉĶɾ" + ], + [ + "æ", + "Éľn" + ], + [ + "k", + "a" + ], + [ + "Åĭ", + "É¡" + ], + [ + "b", + "ÉĻn" + ], + [ + "ÊĬ", + "f" + ], + [ + "Ġp", + "ɹ" + ], + [ + "Ġl", + "ËĮe" + ], + [ + "ËĪiËIJ", + "d" + ], + [ + "ËĪaËIJ", + "re" + ], + [ + "Ġm", + "ËĪÊĮ" + ], + [ + "ÉĻ", + "r" + ], + [ + "Ġd", + "Éij" + ], + [ + "ËĪaËIJt", + "o" + ], + [ + "Ġp", + "ËĪeËIJ" + ], + [ + "Ġd", + "ËĪoËIJ" + ], + [ + "Ġs", + "ËĮÊĬ" + ], + [ + "Ġh", + "ËĪi" + ], + [ + "Ġs", + "ËĪa" + ], + [ + "ËĪeËIJ", + "n" + ], + [ + "d", 
+ "ÉĻ" + ], + [ + "Ġp", + "j" + ], + [ + "ËĪÅĵ", + "Êģ" + ], + [ + "l", + "ɪç" + ], + [ + "ÉĴ", + "n" + ], + [ + "ĠËĪÉĻ", + "r" + ], + [ + "t", + "ËĪe" + ], + [ + "Ġi", + "l" + ], + [ + "ËĪaËIJ", + "l" + ], + [ + "Ġs", + "ËĮÉĻÊĬ" + ], + [ + "s", + "ÊĪ" + ], + [ + "Ġd", + "ËĪuËIJ" + ], + [ + "h", + "ËĪÉij" + ], + [ + "Ġx", + "ËĪou" + ], + [ + "Ġl", + "ËĪaiÉľ" + ], + [ + "w", + "ËĪo" + ], + [ + "ËĪÉĽnt", + "e" + ], + [ + "Ġs", + "y" + ], + [ + "Ġz", + "ɪç" + ], + [ + "ĠÉ¡", + "ËĪu" + ], + [ + "ĠÉķ", + "ËĪy" + ], + [ + "ËĪÉĶËIJ", + "l" + ], + [ + "ÉĶ", + "l" + ], + [ + "Ġt", + "ËĪo" + ], + [ + "ĠÊĭ", + "oËIJ" + ], + [ + "Ġ", + "iËIJ" + ], + [ + "wËĪa", + "ða" + ], + [ + "ËĪa", + "ndo" + ], + [ + "Ġaθ", + "ÉĽnt" + ], + [ + "Ġaθɼnt", + "wËĪaða" + ], + [ + "Ġt", + "ËĪiÉĽ" + ], + [ + "ËĪei", + "Éľ" + ], + [ + "Ġp", + "ËĮa" + ], + [ + "Ġn", + "ËĪaɪ" + ], + [ + "w", + "a" + ], + [ + "Ġf", + "r" + ], + [ + "ĠÊIJ", + "ËĪÉĻÉľn" + ], + [ + "ËĪu", + "a" + ], + [ + "m", + "i" + ], + [ + "Ġm", + "ËĪÉĽ" + ], + [ + "ËĪeËIJk", + "ʰ" + ], + [ + "c", + "ʰ" + ], + [ + "Ġw", + "ËĪÉij" + ], + [ + "st", + "a" + ], + [ + "Ġt", + "u" + ], + [ + "Ġs", + "k" + ], + [ + "ËĪÉĶ", + "l" + ], + [ + "ËĪeËIJ", + "ÊĪ" + ], + [ + "Ġl", + "ËĪaËIJɪ" + ], + [ + "Ġl", + "ËĪaËIJ" + ], + [ + "ËĪÉĽËIJ", + "s" + ], + [ + "ËĪÉĽÉ¾", + "a" + ], + [ + "ËĪÉĻ", + "Éľt" + ], + [ + "Ġ", + "yn" + ], + [ + "d", + "ÉĻn" + ], + [ + "Ġd", + "i" + ], + [ + "ËĪiËIJ", + "s" + ], + [ + "Ġðe", + "l" + ], + [ + "ËĪÊĮ", + "r" + ], + [ + "Ġh", + "ËĪaËIJ" + ], + [ + "Ġb", + "ÉĻ" + ], + [ + "Ġj", + "ËĪuËIJ" + ], + [ + "ll", + "e" + ], + [ + "st", + "o" + ], + [ + "ËĪɪ", + "t" + ], + [ + "ËĪoËIJ", + "ɾ" + ], + [ + "b", + "ʰ" + ], + [ + "m", + "ÉĻn" + ], + [ + "ËĮu", + "ÉĻ" + ], + [ + "ËĮÉĻ", + "ɾ" + ], + [ + "ËĪÊĮ", + "n" + ], + [ + "ĠlËĪaɪ", + "k" + ], + [ + "Ġb", + "ËĪa" + ], + [ + "ɪ", + "ð" + ], + [ + "Ġl", + "o" + ], + [ + "z", + "i" + ], + [ + "ËĪÊĮ", + "st" + ], + [ + "m", + "ËĪi" + ], + [ + "ÉĶ", + "Êģ" + ], + [ + "ĠnËĪɪ", + 
"çt" + ], + [ + "Ġt", + "ɾ" + ], + [ + "Ġd", + "ËĪeËIJkʰ" + ], + [ + "Ġs", + "ËĮe" + ], + [ + "Ġn", + "ËĪÉĻÊĬ" + ], + [ + "Ġ", + "u" + ], + [ + "Ġs", + "i" + ], + [ + "Ġɪ", + "ç" + ], + [ + "Ġp", + "r" + ], + [ + "ĠtÉķ", + "ËĪy" + ], + [ + "Ġm", + "ËĪu" + ], + [ + "z", + "a" + ], + [ + "Ġt", + "Êģ" + ], + [ + "Ġw", + "ɪð" + ], + [ + "t", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪÊĮɾ" + ], + [ + "Ġk", + "ËĪÉĶ" + ], + [ + "ËĪoËIJ", + "r" + ], + [ + "Ġh", + "ËĮa" + ], + [ + "Ġk", + "ËĪonÉ¡" + ], + [ + "Ġp", + "uÊģ" + ], + [ + "Ġd", + "y" + ], + [ + "ËĪɪ", + "n" + ], + [ + "nt", + "e" + ], + [ + "Ġk", + "ËĮa" + ], + [ + "ËĪÉĻ", + "ɪ" + ], + [ + "Ġm", + "i" + ], + [ + "ĠÉ¡", + "ËĮuÉĻ" + ], + [ + "ĠÊ", + "²" + ], + [ + "Ġf", + "ËĪÉij" + ], + [ + "Ġv", + "ÉijËIJ" + ], + [ + "ĠËĮa", + "ÊĬ" + ], + [ + "ËĮ", + "uËIJ" + ], + [ + "ĠËĪu", + "n" + ], + [ + "Ġj", + "ËĪÊĮha" + ], + [ + "j", + "uËIJ" + ], + [ + "Ġm", + "ɪt" + ], + [ + "Ġl", + "ËĪÉĽ" + ], + [ + "ËĪeËIJ", + "Êĥ" + ], + [ + "Ġf", + "ÉĶËIJ" + ], + [ + "m", + "ÉĻ" + ], + [ + "ɾ", + "t" + ], + [ + "ĠkËĮo", + "n" + ], + [ + "Ġl", + "ËĪÉĶ" + ], + [ + "Ġx", + "ËĪÉiju" + ], + [ + "p", + "l" + ], + [ + "Ġd", + "ËĪi" + ], + [ + "Ġl", + "ËĪoËIJ" + ], + [ + "s", + "ÉĻ" + ], + [ + "ËĪaËIJ", + "va" + ], + [ + "Ġl", + "ËĪu" + ], + [ + "ĠÉ¡", + "ËĮÉĻÊĬ" + ], + [ + "Ġh", + "av" + ], + [ + "ĠËĮaËIJpk", + "ËĮoËIJ" + ], + [ + "ɾ", + "ËĪi" + ], + [ + "Ġf", + "ËĪÉĻ" + ], + [ + "Ġh", + "ËĮÉĻm" + ], + [ + "ËĪonÉ¡", + "Éľ" + ], + [ + "j", + "o" + ], + [ + "Ġs", + "ÉĶ" + ], + [ + "ËĪaËIJ", + "d" + ], + [ + "w", + "ËĪiÉĻ" + ], + [ + "ËĪa", + "nd" + ], + [ + "ËĮa", + "ɪn" + ], + [ + "t", + "ɾ" + ], + [ + "ĠËĮ", + "ɪ" + ], + [ + "ĠËĪu", + "na" + ], + [ + "Ġx", + "wËĪÉij" + ], + [ + "Ġj", + "ÉĶËIJ" + ], + [ + "Êģ", + "ËĪi" + ], + [ + "ĠkËĪuo", + "Éľ" + ], + [ + "Ġa", + "β" + ], + [ + "ĠÉ¡", + "ËĪaËIJ" + ], + [ + "an", + "o" + ], + [ + "t", + "ÉĻl" + ], + [ + "Ġr", + "ËĮe" + ], + [ + "ËĮÊĮ", + "t" + ], + [ + "ĠjËĪi", + "Éij" + ], + [ + "ĠɾËĮÉĻh", + "aËIJ" + 
], + [ + "Ġm", + "ËĪe" + ], + [ + "ĠËĪy", + "Ã¦Éľn" + ], + [ + "Ġf", + "ËĪu" + ], + [ + "Ġb", + "l" + ], + [ + "n", + "ËĪi" + ], + [ + "s", + "ÉĻn" + ], + [ + "Ġa", + "ɪn" + ], + [ + "ËĪi", + "ÊĬ" + ], + [ + "Ġðe", + "ɪ" + ], + [ + "Ġɪ", + "ts" + ], + [ + "Ġ", + "(" + ], + [ + "ËĪy", + "ËIJ" + ], + [ + "ÉĻ", + "d" + ], + [ + "ĠËĮ", + "o" + ], + [ + "ĠÉĽ", + "s" + ], + [ + "Ġv", + "iËIJ" + ], + [ + "ËIJ", + "É¡eËIJ" + ], + [ + "k", + "ËĪe" + ], + [ + "ĠËĪa", + "l" + ], + [ + "ÉĽ", + "l" + ], + [ + "Ġ", + "ÊĮ" + ], + [ + "ËIJ", + "o" + ], + [ + "Ġk", + "ËĪo" + ], + [ + "ĠÊĪ", + "ËĪuËIJ" + ], + [ + "Ġs", + "ËĪɪ" + ], + [ + "ËĪeËIJ", + "ɾ" + ], + [ + "Éľ", + "m" + ], + [ + "ËĮ", + "ÉĻn" + ], + [ + "ËĪaËIJ", + "i" + ], + [ + "ËĪoËIJ", + "l" + ], + [ + "ɪ", + "ËĮeËIJ" + ], + [ + "Ġʲ", + "ËĪy" + ], + [ + "Ġk", + "ËĪÉĶËIJ" + ], + [ + "s", + "ËĪi" + ], + [ + "Ġl", + "ËĪe" + ], + [ + "ËĮ", + "ÉĴt" + ], + [ + "ËĪiËIJ", + "p" + ], + [ + "a", + "Êģ" + ], + [ + "Ġθ", + "ËĪɪÅĭ" + ], + [ + "ËĪÉĻËIJ", + "ɪ" + ], + [ + "ËĪÊĮ", + "l" + ], + [ + "ĠhËĪoËIJ", + "taËIJ" + ], + [ + "ËĪo", + "ɪ" + ], + [ + "nt", + "o" + ], + [ + "z", + "h" + ], + [ + "ĠdeËIJ", + "m" + ], + [ + "ĠkÉĶ", + "m" + ], + [ + "ʰ", + "ËĪiËIJk" + ], + [ + "ĠdÊĴ", + "ËĪÊĮst" + ], + [ + "p", + "ɾ" + ], + [ + "Ġl", + "y" + ], + [ + "h", + "ËĪu" + ], + [ + "ËĪÉĶ", + "ø" + ], + [ + "ËĪaËIJ", + "s" + ], + [ + "ĠËĪa", + "n" + ], + [ + "Ġ", + "ËĪÉĴ" + ], + [ + "Ġk", + "an" + ], + [ + "Ġts", + "ËĪuo" + ], + [ + "ËĪeËIJ", + "va" + ], + [ + "ĠÉ¡", + "ɾ" + ], + [ + "Ġp", + "o" + ], + [ + "ĠtÊĥ", + "ËĪÉĶ" + ], + [ + "Êİ", + "a" + ], + [ + "Ġm", + "ËĮi" + ], + [ + "Êĥ", + "t" + ], + [ + "t", + "ËĪi" + ], + [ + "Ġh", + "ËĪÊĮ" + ], + [ + "tÊĥ", + "e" + ], + [ + "Ġf", + "ÉĶn" + ], + [ + "v", + "e" + ], + [ + "Ġn", + "ËĮe" + ], + [ + "ËĪÉĶ", + "Êģ" + ], + [ + "i", + "z" + ], + [ + "Ġs", + "ËĪuo" + ], + [ + "ËĪÉĽËIJ", + "r" + ], + [ + "wËĪa", + "Êģ" + ], + [ + "ËĪað", + "a" + ], + [ + "Åĭ", + "k" + ], + [ + "p", + "o" + ], + [ + "Ġk", 
+ "ËĪi" + ], + [ + "ËĪa", + "d" + ], + [ + "Ġv", + "ËĪi" + ], + [ + "t", + "Éķ" + ], + [ + "Ġk", + "ËĪÉĻ" + ], + [ + "Ġw", + "ËĪu" + ], + [ + "ÉĴ", + "z" + ], + [ + "ĠvÉijËIJ", + "ɾ" + ], + [ + "Êģ", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪaËIJ" + ], + [ + "k", + "e" + ], + [ + "n", + "ÉĻ" + ], + [ + "ËĪÊĮ", + "b" + ], + [ + "ËĪuËIJ", + "ɾ" + ], + [ + "ËĮÉĻ", + "ËIJ" + ], + [ + "ĠÊĪ", + "ʰËĪiËIJk" + ], + [ + "Ġk", + "ËĪu" + ], + [ + "Ġb", + "ËĮÊĮt" + ], + [ + "Ġa", + "t" + ], + [ + "Ġf", + "ɹ" + ], + [ + "ËĪa", + "x" + ], + [ + "Ġz", + "oËIJ" + ], + [ + "Ġt", + "ËĪaËIJ" + ], + [ + "Ġð", + "ËĮe" + ], + [ + "n", + "eËIJ" + ], + [ + "ĠÉij", + "ËIJ" + ], + [ + "Ġa", + "ÊĬf" + ], + [ + "a", + "m" + ], + [ + "ÊĬ", + "Åĭ" + ], + [ + "ĠÉĶ", + "ËIJ" + ], + [ + "ĠÉķËĪi", + "ÉľÅĭ" + ], + [ + "Ġ", + "ËĪÉĶËIJl" + ], + [ + "ɪ", + "m" + ], + [ + "j", + "ËĪo" + ], + [ + "ËĪiËIJ", + "ÉŁ" + ], + [ + "Ġkw", + "ËĮÉĽ" + ], + [ + "ĠmËĪa", + "s" + ], + [ + "ÉĻ", + "h" + ], + [ + "ĠËĪa", + "ÊĬ" + ], + [ + "ËĪÉĶ", + "ɪ" + ], + [ + "É¡", + "ÉĻɾ" + ], + [ + "r", + "ÉĻn" + ], + [ + "ËĪɪ", + "k" + ], + [ + "s", + "se" + ], + [ + "Ġp", + "ËĪÉij" + ], + [ + "ĠÉĹ", + "ËĮe" + ], + [ + "ĠÉĹ", + "ËĪi" + ], + [ + "Ġa", + "z" + ], + [ + "ĠÉ¡ËĪÊĮ", + "jaËIJ" + ], + [ + "z", + "e" + ], + [ + "ĠÉĹ", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĪi" + ], + [ + "ĠËĮ", + "ÉĴn" + ], + [ + "Ġx", + "ËĪo" + ], + [ + "ĠËĮÊĬ", + "na" + ], + [ + "Ġtʰ", + "aËIJ" + ], + [ + "Ġs", + "Éij" + ], + [ + "ËĪeɪ", + "ÊĥÉĻn" + ], + [ + "ĠtÉķËĪi", + "Éľ" + ], + [ + "ĠÉŁ", + "aËIJ" + ], + [ + "p", + "ËIJ" + ], + [ + "Ġpl", + "y" + ], + [ + "θ", + "ËĪi" + ], + [ + "ËIJ", + "Éĸ" + ], + [ + "Ġt", + "ËĪuei" + ], + [ + "Ġl", + "ËĪÉĻ" + ], + [ + "Ġd", + "ÉijËIJ" + ], + [ + "f", + "t" + ], + [ + "ËĪa", + "m" + ], + [ + "ĠsËĪÊĮ", + "kt" + ], + [ + "Ġt", + "ËĪou" + ], + [ + "Ġp", + "ËĪiÉĽ" + ], + [ + "ĠËĪa", + "i" + ], + [ + "ĠwËĪÉĴ", + "n" + ], + [ + "Ġz", + "ËĮaɪn" + ], + [ + "Ġe", + "st" + ], + [ + "Ġm", + "ÉĶ" + ], + [ + "ĠtÉķ", + "jËĪÉiju" + ], + [ + 
"Éľ", + "p" + ], + [ + "ËĪÊĮ", + "z" + ], + [ + "b", + "i" + ], + [ + "ËĪÉĽËIJs", + "eËIJ" + ], + [ + "Ġl", + "ËĪy" + ], + [ + "Ġm", + "ËĮe" + ], + [ + "Ġd", + "ËĮÉĽl" + ], + [ + "ËĪiËIJ", + "l" + ], + [ + "ĠkËĮo", + "mo" + ], + [ + "Ġh", + "ËĪaÉľn" + ], + [ + "ËĪoËIJ", + "ne" + ], + [ + "ĠkËĪÊĮɾ", + "t" + ], + [ + "Ġsy", + "Êģ" + ], + [ + "ËĮÉĶ", + "ɾ" + ], + [ + "Ġɪ", + "f" + ], + [ + "u", + "v" + ], + [ + "z", + "ÉĻn" + ], + [ + "o", + "l" + ], + [ + "Ï", + "ĩ" + ], + [ + "i", + "m" + ], + [ + "Ġm", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ɪ" + ], + [ + "Ġv", + "ËĪÉĽ" + ], + [ + "ÊĬ", + "d" + ], + [ + "Ġt", + "r" + ], + [ + "ËĪeËIJ", + "s" + ], + [ + "ð", + "e" + ], + [ + "d", + "e" + ], + [ + "ʰ", + "Ïĩ" + ], + [ + "ÉŁ", + "ʰ" + ], + [ + "ËĮÉĻËIJ", + "ÉªÉľ" + ], + [ + "b", + "ËIJ" + ], + [ + "ËĪÊĬ", + "k" + ], + [ + "ĠnËĪÉĶ", + "ÉªÉľ" + ], + [ + "ĠËĮ", + "iËIJ" + ], + [ + "ËĪÉijËIJ", + "t" + ], + [ + "ËĪiËIJ", + "ɾ" + ], + [ + "Ġt", + "ɹ" + ], + [ + "ɾ", + "ÉĶ" + ], + [ + "Ġw", + "ÉĴz" + ], + [ + "Ġv", + "u" + ], + [ + "b", + "ÉĻl" + ], + [ + "b", + "ÉĻ" + ], + [ + "ɹ", + "i" + ], + [ + "nt", + "s" + ], + [ + "Ġs", + "ËĪaËIJ" + ], + [ + "d", + "ʰ" + ], + [ + "Ġt", + "ÊĬ" + ], + [ + "ĠÊİ", + "ËĮi" + ], + [ + "β", + "a" + ], + [ + "h", + "ËĪÉĻÉľÅĭ" + ], + [ + "Ġs", + "ËĪiËIJ" + ], + [ + "ĠpËĮa", + "ɾa" + ], + [ + "ËĪÉĽÉ¾", + "ÉĶ" + ], + [ + "ËĪɪ", + "s" + ], + [ + "É£", + "o" + ], + [ + "ĠËĮa", + "l" + ], + [ + "o", + "r" + ], + [ + "Ġb", + "ËĪÊĮh" + ], + [ + "Ġk", + "ËĪoËIJ" + ], + [ + "Ġt", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪo" + ], + [ + "ĠÊĴ", + "ÉĻ" + ], + [ + "p", + "Êģ" + ], + [ + "Ġ", + "ËĪaɪ" + ], + [ + "hËĪÉij", + "ÉľÅĭ" + ], + [ + "ÉĻl", + "i" + ], + [ + "ËĪeɪ", + "t" + ], + [ + "ĠjËĪiou", + "Éľ" + ], + [ + "Ġd", + "ËĪÉĻ" + ], + [ + "Ġm", + "ËĪÉĶËIJ" + ], + [ + "l", + "ËĪi" + ], + [ + "ËĮy", + "ÉĻ" + ], + [ + "ĠlËĪoËIJ", + "É¡" + ], + [ + "Ġn", + "ËĪÊĮ" + ], + [ + "Ġh", + "ËĪÊĬ" + ], + [ + "Ġn", + "ËĪÉĻÉľÅĭ" + ], + [ + "ĠÊģ", + "ÉĻ" + ], + [ + "z", + "ËĪi" + ], + [ + 
"Ġt", + "ËĪuËIJ" + ], + [ + "ĠkËĮo", + "me" + ], + [ + "Ġl", + "ËĪeËIJ" + ], + [ + "ËĪaËIJt", + "aËIJ" + ], + [ + "Ġa", + "n" + ], + [ + "ĠËĪy", + "u" + ], + [ + "ĠËĮÊĮ", + "É¡ÉĻɾ" + ], + [ + "ĠËĪɪ", + "n" + ], + [ + "ĠhËĪo", + "ÉĻ" + ], + [ + "v", + "ÉĻ" + ], + [ + "ËĪø", + "ËIJ" + ], + [ + "θj", + "a" + ], + [ + "ËĪuÉĻ", + "Éľn" + ], + [ + "Ġk", + "ÉĻɾ" + ], + [ + "ËĪa", + "t" + ], + [ + "j", + "ËĪø" + ], + [ + "ËĪÉĽt", + "Êģ" + ], + [ + "Ġp", + "ËĪÉiju" + ], + [ + "st", + "ÉĻ" + ], + [ + "Ġw", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "l" + ], + [ + "ÊĪ", + "i" + ], + [ + "Ġx", + "ËĪaiÉľ" + ], + [ + "ËĪy", + "Êģ" + ], + [ + "ĠhËĪoËIJ", + "É¡aËIJ" + ], + [ + "Ġts", + "ËĪi" + ], + [ + "ĠËĪÊĮ", + "p" + ], + [ + "Ġn", + "ËĮÉĴt" + ], + [ + "ĠlËĪɪ", + "eËIJ" + ], + [ + "Ġh", + "ËĪa" + ], + [ + "Ġf", + "l" + ], + [ + "Ġn", + "ËĪeËIJ" + ], + [ + "ËĮaËIJ", + "ɪ" + ], + [ + "Ġt", + "ËĪuo" + ], + [ + "tÊĥ", + "ËIJ" + ], + [ + "s", + "ËĪe" + ], + [ + "bʰ", + "i" + ], + [ + "ĠbËĪÊĮh", + "ÊĬt" + ], + [ + "ËĪÉĽ", + "nd" + ], + [ + "Ġs", + "ËĪÉĶ" + ], + [ + "ÉĻn", + "s" + ], + [ + "ËĮÉĻ", + "l" + ], + [ + "ÉĽ", + "Éľ" + ], + [ + "ĠÉ¡", + "l" + ], + [ + "ËĪɪ", + "ɾ" + ], + [ + "ËĪaËIJt", + "a" + ], + [ + "Éľ", + "ËIJ" + ], + [ + "ËĪÉĽnt", + "o" + ], + [ + "sk", + "ËĮoËIJ" + ], + [ + "ËĪÉĽ", + "k" + ], + [ + "ts", + "i" + ], + [ + "Ġt", + "ËĪonÉ¡" + ], + [ + "Ġb", + "iËIJ" + ], + [ + "Ġh", + "ËĪaËIJɪ" + ], + [ + "Ġb", + "ËĪi" + ], + [ + "j", + "j" + ], + [ + "Êİ", + "i" + ], + [ + "Ġk", + "ʰ" + ], + [ + "Ġs", + "ËĪo" + ], + [ + "ll", + "o" + ], + [ + "Ġb", + "aɪ" + ], + [ + "ĠÉĽ", + "nt" + ], + [ + "Ġ", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪo" + ], + [ + "ɾ", + "eËIJ" + ], + [ + "Ġk", + "Êĭ" + ], + [ + "Ġm", + "ËĪeiÉľ" + ], + [ + "ÊĬ", + "ËĪÉĶËIJ" + ], + [ + "Ġt", + "ËĪaɪ" + ], + [ + "Ġsu", + "s" + ], + [ + "Ġr", + "i" + ], + [ + "Ġv", + "ËĮÉĽ" + ], + [ + "ËĪiËIJ", + "no" + ], + [ + "v", + "ano" + ], + [ + "ĠdËĮi", + "ËIJ" + ], + [ + "ĠÊIJ", + "ËĪaÉľn" + ], + [ + "Ê", + "Ĥ" + ], + [ + "ĠÉIJ", + 
"b" + ], + [ + "ËĪaËIJ", + "h" + ], + [ + "ɪ", + "Êĥ" + ], + [ + "ĠdËĮe", + "lla" + ], + [ + "tËIJ", + "i" + ], + [ + "ĠËĪÊĬ", + "n" + ], + [ + "Ġh", + "iËIJ" + ], + [ + "Ġb", + "ËĪaËIJt" + ], + [ + "Ġth", + "ËĪi" + ], + [ + "Ġa", + "m" + ], + [ + "Ġ", + "ËĪoËIJ" + ], + [ + "Ġh", + "u" + ], + [ + "Ġk", + "ËĪÊĮh" + ], + [ + "Ġz", + "ËĪÉijËIJ" + ], + [ + "ĠÉ¡", + "ËĮÉĶ" + ], + [ + "Ġ", + "ËĪÉĻÊĬ" + ], + [ + "y", + "ËĪi" + ], + [ + "Ġl", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪeËIJ" + ], + [ + "Ġs", + "ËĪÉĶËIJ" + ], + [ + "sk", + "ËĮeËIJ" + ], + [ + "ɾ", + "o" + ], + [ + "Êģ", + "ËĪÉij" + ], + [ + "t", + "ËĪa" + ], + [ + "Ġk", + "ËĪÊĬ" + ], + [ + "ËĪant", + "e" + ], + [ + "Ġd", + "ÉĶ" + ], + [ + "Ġs", + "ËĪeɪ" + ], + [ + "Ġs", + "ÉĽt" + ], + [ + "ɹ", + "ɪ" + ], + [ + "ĠÉ¡ËĮÉĻÊĬ", + "ɪÅĭ" + ], + [ + "z", + "o" + ], + [ + "Ġj", + "ËĪaËIJ" + ], + [ + "ĠÉĴv", + "ðÉĻ" + ], + [ + "ĠÊ", + "Ŀ" + ], + [ + "ĠÉĽ", + "l" + ], + [ + "Ġs", + "ËĪoËIJ" + ], + [ + "Ġth", + "ËĪiÉľ" + ], + [ + "Ġ", + "ËĪÉĽl" + ], + [ + "Ġly", + "ËĮi" + ], + [ + "nd", + "ÊĴ" + ], + [ + "ĠÉķ", + "jËĪÉiju" + ], + [ + "θ", + "a" + ], + [ + "ĠɾËĮÉĻh", + "eËIJ" + ], + [ + "Ġma", + "ɪ" + ], + [ + "j", + "ÉĻ" + ], + [ + "ĠËĪÊĮ", + "b" + ], + [ + "as", + "jËĪÉĶ" + ], + [ + "d", + "Êģ" + ], + [ + "Ġkh", + "ËĪa" + ], + [ + "ĠËĪe", + "s" + ], + [ + "v", + "i" + ], + [ + "f", + "i" + ], + [ + "ËĮÉĻ", + "b" + ], + [ + "Ġr", + "e" + ], + [ + "Ġav", + "ËĮÉĽ" + ], + [ + "Ġt", + "ËĮi" + ], + [ + "Ġk", + "ɾ" + ], + [ + "Ġb", + "ɪk" + ], + [ + "st", + "e" + ], + [ + "ËĪeËIJÊĥ", + "c" + ], + [ + "p", + "t" + ], + [ + "z", + "ÉĻ" + ], + [ + "Ġw", + "ËĪaËIJ" + ], + [ + "k", + "l" + ], + [ + "ĠsËĪÊĮ", + "m" + ], + [ + "ɪ", + "ÊĪ" + ], + [ + "d", + "z" + ], + [ + "v", + "o" + ], + [ + "ËĮa", + "ÊĬt" + ], + [ + "nd", + "e" + ], + [ + "Ġd", + "ÉĽs" + ], + [ + "ĠÉŁ", + "ËĪaËIJ" + ], + [ + "Ġr", + "ËĮi" + ], + [ + "s", + "ËĮeËIJ" + ], + [ + "É¡", + "i" + ], + [ + "Ġal", + "s" + ], + [ + "ËĪi", + "ðo" + ], + [ + "ĠnËĪi", + "Éľn" + ], + [ + 
"ÊĬ", + "l" + ], + [ + "ts", + "ËIJ" + ], + [ + "ËĪant", + "o" + ], + [ + "ĠÉĹ", + "ËĪÉĻÊĬ" + ], + [ + "kËIJ", + "i" + ], + [ + "ĠsËĪÊĮ", + "b" + ], + [ + "Ġn", + "ËĪa" + ], + [ + "Ġl", + "ËĮo" + ], + [ + "Ġph", + "ËĪi" + ], + [ + "m", + "ËĮe" + ], + [ + "Ġf", + "a" + ], + [ + "k", + "ÉĻ" + ], + [ + "Ġz", + "ËĪu" + ], + [ + "n", + "s" + ], + [ + "ĠÊģ", + "e" + ], + [ + "Ġb", + "ËĪo" + ], + [ + "ËĪaËIJt", + "i" + ], + [ + "Ġm", + "an" + ], + [ + "ĠlËĪi", + "Éij" + ], + [ + "ĠÉĹ", + "ËĮyÉĻ" + ], + [ + "Ġf", + "ËĪÉĶËIJ" + ], + [ + "ĠkÊĭ", + "ËĪeËIJÊĥc" + ], + [ + "Ġx", + "ËĪÉij" + ], + [ + "ĠtÉķ", + "ËĪu" + ], + [ + "j", + "ÉĻɾ" + ], + [ + "Ġɪ", + "st" + ], + [ + "w", + "ËĪi" + ], + [ + "ĠËĮaɪn", + "ÉĻ" + ], + [ + "ɪ", + "É¡" + ], + [ + "Ġs", + "ÊĪ" + ], + [ + "ËĪi", + "ÉĻl" + ], + [ + "Ġn", + "ËĪiÉĽÉľn" + ], + [ + "ĠËĮÉĽ", + "ËIJ" + ], + [ + "ËĪaɪ", + "nd" + ], + [ + "Ġz", + "ËĪi" + ], + [ + "v", + "ÉĻn" + ], + [ + "m", + "z" + ], + [ + "ð", + "os" + ], + [ + "dÊĴ", + "ËIJ" + ], + [ + "j", + "ËĪa" + ], + [ + "ɾ", + "ËĪÉĶ" + ], + [ + "l", + "ËĪe" + ], + [ + "Ê", + "²" + ], + [ + "Ġv", + "ËĪÉĶ" + ], + [ + "Ġl", + "ËĪiÉĽ" + ], + [ + "θ", + "e" + ], + [ + "mËĪe", + "nte" + ], + [ + "Ġɪn", + "ðÉĻ" + ], + [ + "Ġaɪ", + "m" + ], + [ + "n", + "ÉĻn" + ], + [ + "Ġh", + "ÉĻm" + ], + [ + "ɾ", + "aËIJ" + ], + [ + "ĠsËĪuo", + "Éľ" + ], + [ + "Ġɲ", + "ËĪi" + ], + [ + "Ġɹ", + "ËĪiÉĻl" + ], + [ + "l", + "ËĪa" + ], + [ + "Ġb", + "ËĪÉĶ" + ], + [ + "Ġk", + "ËĪai" + ], + [ + "Êģ", + "ËĪa" + ], + [ + "Ġw", + "ËĪÉľËIJ" + ], + [ + "Ġa", + "ËIJ" + ], + [ + "Ġp", + "as" + ], + [ + "ËĪÊĮ", + "s" + ], + [ + "w", + "ËĪÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪe" + ], + [ + "ĠhËĮa", + "tÉĻ" + ], + [ + "a", + "ɪn" + ], + [ + "ĠËĪÉĶ", + "pʰ" + ], + [ + "Êģ", + "ËĪe" + ], + [ + "ĠÉŁaËIJ", + "ËĪeËIJÉ¡aËIJ" + ], + [ + "ĠËĪÊĬ", + "s" + ], + [ + "ĠtÉķhËĪi", + "Éľ" + ], + [ + "nt", + "Êĥ" + ], + [ + "Ġx", + "ËĪuo" + ], + [ + "ËĪu", + "Êģ" + ], + [ + "Ġɪ", + "m" + ], + [ + "ɳ", + "Éĸ" + ], + [ + "ËĪyÉĻ", + "Éľkh" + ], 
+ [ + "ĠËĪy", + "ÉĽ" + ], + [ + "Ġm", + "ËĮaËIJ" + ], + [ + "Åĵ", + "Êģ" + ], + [ + "ĠËĪa", + "lt" + ], + [ + "Ġk", + "ÉĻm" + ], + [ + "Êİ", + "o" + ], + [ + "ĠÉIJ", + "n" + ], + [ + "Ġf", + "y" + ], + [ + "ĠËĮÉĽ", + "ra" + ], + [ + "ĠÉ¡", + "ËĪÊĬ" + ], + [ + "Ġp", + "ËĪÊĮ" + ], + [ + "l", + "s" + ], + [ + "Ġl", + "ËĪiËIJ" + ], + [ + "ĠÊĤ", + "ËĪy" + ], + [ + "Ġbɪk", + "ËĪÊĮz" + ], + [ + "ĠÉ¡", + "ÉĽt" + ], + [ + "Ġb", + "ɾ" + ], + [ + "t", + "ʰ" + ], + [ + "tÉĻl", + "ËĮÉĻb" + ], + [ + "x", + "o" + ], + [ + "sk", + "ËĮaËIJ" + ], + [ + "ɲ", + "ʲ" + ], + [ + "ËĪeËIJk", + "ÊĪ" + ], + [ + "r", + "ÉĻ" + ], + [ + "tÊĥ", + "o" + ], + [ + "ĠpÊģ", + "ÉĶ" + ], + [ + "Ġɹ", + "ËĪaɪt" + ], + [ + "Ġp", + "ËĪei" + ], + [ + "ËĮ", + "ɪç" + ], + [ + "j", + "ËĪÉĽÉ¾" + ], + [ + "tËIJ", + "a" + ], + [ + "ĠÉIJb", + "ËĮaÊĬt" + ], + [ + "ĠkÊĭËĪeËIJÊĥc", + "ÉĻn" + ], + [ + "Ġv", + "ËĪe" + ], + [ + "ÊĬ", + "Éľ" + ], + [ + "Ġa", + "kËĪe" + ], + [ + "Ġp", + "ËĪai" + ], + [ + "v", + "ËĪÉĽ" + ], + [ + "Ġθ", + "ɹ" + ], + [ + "ɪ", + "f" + ], + [ + "Ġav", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪe" + ], + [ + "d", + "ËĪi" + ], + [ + "ËĪeËIJ", + "Éĸ" + ], + [ + "Ġb", + "ÉĻt" + ], + [ + "ÊĪ", + "ʰ" + ], + [ + "t", + "eËIJ" + ], + [ + "θj", + "ËĪÉĶn" + ], + [ + "d", + "Éľ" + ], + [ + "ĠjËĪi", + "Éľ" + ], + [ + "Ġv", + "e" + ], + [ + "É£", + "ËĪu" + ], + [ + "ËĪÊĮh", + "ÉĻl" + ], + [ + "Ġp", + "ÉĶ" + ], + [ + "ĠÉ¡", + "r" + ], + [ + "Ġð", + "a" + ], + [ + "Ġv", + "ËĪiËIJ" + ], + [ + "ĠËĮ", + "ÉijËIJ" + ], + [ + "ËĪÉĻÊĬ", + "nt" + ], + [ + "Ġb", + "ËĪaËIJɾ" + ], + [ + "ĠmËĪÊĮ", + "tÉĻlËĮÉĻb" + ], + [ + "l", + "d" + ], + [ + "ĠtÉķ", + "ËĮÉĶ" + ], + [ + "p", + "a" + ], + [ + "ð", + "ËĪad" + ], + [ + "ËĪi", + "ɾ" + ], + [ + "Ġx", + "ËĪu" + ], + [ + "ĠlËĪi", + "ÉľÅĭ" + ], + [ + "ËĪeɪ", + "s" + ], + [ + "ĠÉĹËĮe", + "Éľn" + ], + [ + "Ġth", + "ËĪiÉĽ" + ], + [ + "tËIJ", + "e" + ], + [ + "ĠavËĮÉĽ", + "k" + ], + [ + "ĠËĮ", + "ÉĶ" + ], + [ + "Ġk", + "ËĪÉiju" + ], + [ + "ɪ", + "v" + ], + [ + "iËIJ", + "z" + ], + [ + "ËĪo", 
+ "s" + ], + [ + "ĠÉ¡", + "ɹ" + ], + [ + "a", + "nd" + ], + [ + "ĠlËĪi", + "ou" + ], + [ + "ĠËĪo", + "Éľ" + ], + [ + "É¡", + "l" + ], + [ + "Ġp", + "ËĪÉĶËIJ" + ], + [ + "Ġm", + "ËĮeËIJ" + ], + [ + "Ġk", + "ËĪÉĴ" + ], + [ + "n", + "os" + ], + [ + "ç", + "ÉĻn" + ], + [ + "f", + "ÉĻn" + ], + [ + "ĠsËĪÊĮkt", + "ËĮeËIJ" + ], + [ + "Ġ", + "ËĪaɪn" + ], + [ + "ËĪoËIJ", + "re" + ], + [ + "j", + "ËĪÉĽn" + ], + [ + "Ġð", + "ËĪÉĽn" + ], + [ + "ĠtÉķh", + "ËĪiÉĽÉľn" + ], + [ + "Ġh", + "ËĪaɪ" + ], + [ + "ɾ", + "ËĪÉĽ" + ], + [ + "Ġs", + "ËĪu" + ], + [ + "ĠkËĪɪ", + "jaËIJ" + ], + [ + "Ġpj", + "ËĮÊĬ" + ], + [ + "ĠhÉĻm", + "ËĮaËIJ" + ], + [ + "ĠËĮÊĮ", + "p" + ], + [ + "Ġp", + "ËĪÊĮhÉĻl" + ], + [ + "Ġx", + "ËĪÉĻ" + ], + [ + "d", + "ËĪe" + ], + [ + "Ġm", + "Éij" + ], + [ + "ĠÊĬ", + "m" + ], + [ + "nd", + "ÉĻ" + ], + [ + "Ġd", + "ËĪÉĻÊĬnt" + ], + [ + "ËĪeËIJ", + "ÊĥÉĻn" + ], + [ + "Ġða", + "ts" + ], + [ + "i", + "s" + ], + [ + "Ġc", + "ËĪaËIJh" + ], + [ + "p", + "e" + ], + [ + "Ġs", + "ËĮo" + ], + [ + "Ġð", + "ËĪe" + ], + [ + "Ġs", + "ËĪaËIJt" + ], + [ + "ËĪa", + "Êģ" + ], + [ + "Ġs", + "ËĪe" + ], + [ + "ÉĻ", + "k" + ], + [ + "ɪ", + "Êĭ" + ], + [ + "ĠkËĪoËIJ", + "i" + ], + [ + "k", + "ÉĶ" + ], + [ + "Ġv", + "ËĪaËIJÊĬ" + ], + [ + "Ġf", + "ËĪei" + ], + [ + "Ġl", + "ËĪeËIJk" + ], + [ + "Ġh", + "ËĪiÉĻ" + ], + [ + "Ġa", + "ÊĬ" + ], + [ + "ËĪÉĽ", + "ndo" + ], + [ + "ËĪe", + "s" + ], + [ + "Ġz", + "ËĪÉĶ" + ], + [ + "Ġ", + "ËĪÉĽÉ¾a" + ], + [ + "nËĪi", + "Éľn" + ], + [ + "ĠkËĪÊĮ", + "m" + ], + [ + "Ġl", + "ËĪÉĴ" + ], + [ + "ɪ", + "st" + ], + [ + "Ġp", + "Éij" + ], + [ + "Ġf", + "ËĪÉĶ" + ], + [ + "Ġth", + "ËĪonÉ¡" + ], + [ + "nk", + "e" + ], + [ + "ËĮ", + "ɪk" + ], + [ + "Ġɲ", + "ËĪÉĻ" + ], + [ + "ËĮÊĮ", + "m" + ], + [ + "ËĪiËIJ", + "t" + ], + [ + "ĠwËĪÉĴ", + "nt" + ], + [ + "ËĪaβ", + "an" + ], + [ + "ĠbËĪÊĮ", + "r" + ], + [ + "ÉĽ", + "nd" + ], + [ + "ĠËĮÉijËIJ", + "bÉľ" + ], + [ + "Ġv", + "ËĪaɪ" + ], + [ + "ĠtÊĥ", + "ËĮi" + ], + [ + "ĠθËĪɪÅĭ", + "k" + ], + [ + "st", + "i" + ], + [ + "Ġk", + "ɹ" 
+ ], + [ + "ĠËĪa", + "ÊĬt" + ], + [ + "st", + "ÉĻn" + ], + [ + "ĠÊĭ", + "ËĪÊĮn" + ], + [ + "ĠÉ¡", + "ËĮaËIJ" + ], + [ + "ËĪaËIJÉľ", + "ɲ" + ], + [ + "Êģ", + "i" + ], + [ + "ĠnËĪÉĶ", + "x" + ], + [ + "ĠɹËĪiÉĻl", + "ɪ" + ], + [ + "Ġv", + "ËĮi" + ], + [ + "Ġðe", + "ÉĻ" + ], + [ + "ËĮɪ", + "tÊĥ" + ], + [ + "Ġv", + "ËĪyÉĻ" + ], + [ + "ĠËĮaËIJpk", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĮaËIJɪ" + ], + [ + "Ġp", + "ËĪÉĶ" + ], + [ + "ĠnËĪÊĮ", + "mb" + ], + [ + "θ", + "es" + ], + [ + "j", + "ËĪÉĽÊģ" + ], + [ + "ĠkËĪÊĬ", + "cʰ" + ], + [ + "m", + "ËĪÉĽ" + ], + [ + "Ġv", + "ËĪu" + ], + [ + "Ġl", + "ÅĵÊģ" + ], + [ + "ĠiËIJ", + "m" + ], + [ + "ÊĪ", + "ÉĻɾ" + ], + [ + "tÊĥ", + "i" + ], + [ + "ËIJ", + "s" + ], + [ + "Ġt", + "ËĪy" + ], + [ + "ĠmËĪi", + "ÉľÅĭ" + ], + [ + "ɾ", + "ËĪe" + ], + [ + "m", + "ËĮa" + ], + [ + "Ġm", + "ËĮiËIJ" + ], + [ + "ĠÉĽ", + "ks" + ], + [ + "ɪ", + "p" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮaËIJ" + ], + [ + "ĠËĮaÊĬ", + "x" + ], + [ + "r", + "ËĪiËIJ" + ], + [ + "Ġc", + "ËĪÊĮl" + ], + [ + "m", + "os" + ], + [ + "ĠkËĪÊĮɾt", + "ËĮeËIJ" + ], + [ + "iËIJ", + "ɾ" + ], + [ + "k", + "ÉĻn" + ], + [ + "Ġd", + "ËĪu" + ], + [ + "n", + "aËIJ" + ], + [ + "Ġp", + "wËĪe" + ], + [ + "ËĮÉĶ", + "ɪ" + ], + [ + "ĠtÉķh", + "ËĪiÉĽ" + ], + [ + "Ġβ", + "ËĪi" + ], + [ + "ËĪiÉĽ", + "Éľt" + ], + [ + "Ġt", + "e" + ], + [ + "ËĪað", + "os" + ], + [ + "m", + "ËĪa" + ], + [ + "Ġv", + "ËĪo" + ], + [ + "Ġm", + "ËĪɪ" + ], + [ + "Ġb", + "ËĮi" + ], + [ + "a", + "d" + ], + [ + "d", + "o" + ], + [ + "Ġn", + "ËĪaÊĬ" + ], + [ + "ĠʲËĪy", + "Éľ" + ], + [ + "w", + "ËĪÉĽ" + ], + [ + "ËĪi", + "s" + ], + [ + "e", + "l" + ], + [ + "Ġpa", + "r" + ], + [ + "Ġt", + "ËĪai" + ], + [ + "ĠdËĪɪ", + "jaËIJ" + ], + [ + "h", + "ËĪi" + ], + [ + "Ġɾ", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪe" + ], + [ + "ËĪaɪ", + "d" + ], + [ + "Ġp", + "er" + ], + [ + "Ġs", + "ËĮÉĶ" + ], + [ + "w", + "e" + ], + [ + "ÊĬ", + "m" + ], + [ + "Ġi", + "n" + ], + [ + "ĠjËĪuËIJ", + "z" + ], + [ + "ËĪiËIJp", + "ÉĻl" + ], + [ + "ĠÊĭ", + "ËĪaËIJl" + ], + [ + "Ġe", + 
"tËĪÉĽ" + ], + [ + "ËĮÉĽ", + "m" + ], + [ + "Ġn", + "ËĪu" + ], + [ + "ËĪÉĽ", + "kt" + ], + [ + "ĠiËIJ", + "ɾ" + ], + [ + "Ġb", + "ɹ" + ], + [ + "Ġtsh", + "ËĪi" + ], + [ + "ĠÉĹ", + "ËĪÉĶÉľ" + ], + [ + "Ġkw", + "ËĮa" + ], + [ + "Ġf", + "ËĪuÉľ" + ], + [ + "w", + "ËĮa" + ], + [ + "Ġd", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪyÉĻ" + ], + [ + "ËĮÉĽ", + "ËIJ" + ], + [ + "r", + "ËĪa" + ], + [ + "Ġn", + "e" + ], + [ + "Ġz", + "ËĪyÉĻ" + ], + [ + "Ġb", + "ËĪaɪ" + ], + [ + "ĠÉŁ", + "ËĪÊĮb" + ], + [ + "ËĪuËIJ", + "to" + ], + [ + "ÊĬ", + "nt" + ], + [ + "Ġc", + "ʰ" + ], + [ + "ËĪÉĽnt", + "i" + ], + [ + "ËĪo", + "ÉĻ" + ], + [ + "Ġs", + "ËĮÊĮm" + ], + [ + "Ġl", + "Éij" + ], + [ + "ËĮe", + "va" + ], + [ + "ɾ", + "ÉĽ" + ], + [ + "nt", + "Éľ" + ], + [ + "Ġm", + "ËĪÉĽn" + ], + [ + "ËĪÉijËIJ", + "k" + ], + [ + "Ġki", + "l" + ], + [ + "ËĪon", + "es" + ], + [ + "f", + "f" + ], + [ + "Ġm", + "ËĪÉĽËIJ" + ], + [ + "Ġv", + "ËĪÉĻɪ" + ], + [ + "Ġ", + "ËĪÉĶËIJ" + ], + [ + "ĠËĮɪ", + "nt" + ], + [ + "ÊĬ", + "n" + ], + [ + "Ġw", + "ɪl" + ], + [ + "Ġs", + "in" + ], + [ + "ĠËĮa", + "lla" + ], + [ + "Ġaβ", + "ËĪia" + ], + [ + "p", + "i" + ], + [ + "ËĪo", + "Éľ" + ], + [ + "ɪj", + "ËĮaËIJ" + ], + [ + "k", + "u" + ], + [ + "Ġv", + "ËĪɪ" + ], + [ + "Ġtu", + "t" + ], + [ + "ĠtËĪe", + "Éľ" + ], + [ + "Ġh", + "ËĪÉĶ" + ], + [ + "β", + "ɾe" + ], + [ + "s", + "ÉĻɾ" + ], + [ + "Ġkh", + "ËĪai" + ], + [ + "Ġm", + "ËĪÉĶ" + ], + [ + "Ġt", + "a" + ], + [ + "Ġɲ", + "ËĪaËIJ" + ], + [ + "Ġn", + "u" + ], + [ + "ËĪuËIJ", + "n" + ], + [ + "ĠÉĻËIJ", + "Éľ" + ], + [ + "ĠËĪa", + "ÊĬf" + ], + [ + "ËĪiËIJd", + "Éľ" + ], + [ + "nt", + "i" + ], + [ + "Ġp", + "ËĪiËIJpÉĻl" + ], + [ + "Ġk", + "j" + ], + [ + "Ġp", + "e" + ], + [ + "Ġm", + "ËĪÉij" + ], + [ + "ËĮa", + "ɪ" + ], + [ + "ËĪaËIJ", + "le" + ], + [ + "Ġv", + "ËĮÉĻËIJÉªÉľ" + ], + [ + "mp", + "o" + ], + [ + "ĠkËĪɪ", + "t" + ], + [ + "Ġn", + "ËĮÉĽ" + ], + [ + "ĠÉŁ", + "ËĪaËIJtaËIJ" + ], + [ + "ĠsËĪaËIJt", + "ʰ" + ], + [ + "ĠÉŁ", + "ËĪi" + ], + [ + "Ġs", + "o" + ], + [ + "Ġb", + 
"ËĪÉĽ" + ], + [ + "k", + "ËĪi" + ], + [ + "ɪt", + "i" + ], + [ + "Ġts", + "i" + ], + [ + "Ġk", + "Êģ" + ], + [ + "ËĮ", + "ÉĴ" + ], + [ + "É¡", + "ÉĻl" + ], + [ + "k", + "st" + ], + [ + "Ġm", + "ËĪÉĻËIJ" + ], + [ + "ËĪÊĮ", + "k" + ], + [ + "Ġn", + "ËĪaËIJÊĬ" + ], + [ + "Ġa", + "p" + ], + [ + "ĠlËĪɪ", + "kʰ" + ], + [ + "ll", + "i" + ], + [ + "ĠkwËĪa", + "l" + ], + [ + "Ġ", + "ËĪÉĻËIJ" + ], + [ + "Ġts", + "ËĪuei" + ], + [ + "Ġd", + "o" + ], + [ + "ĠkËIJ", + "jËĪo" + ], + [ + "ÊĬ", + "z" + ], + [ + "Ġp", + "ËĪaËIJ" + ], + [ + "Ġm", + "ËĪuËIJ" + ], + [ + "ĠÉ¡ÉĻ", + "v" + ], + [ + "r", + "ËĪi" + ], + [ + "Ġt", + "w" + ], + [ + "ËĮ", + "ɪn" + ], + [ + "d", + "ËĪÉij" + ], + [ + "Ġð", + "ËĪi" + ], + [ + "ĠËĪaËIJ", + "i" + ], + [ + "Ġh", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ËĮÉĽm" + ], + [ + "Ġpʰ", + "ËĪɪɾ" + ], + [ + "ÉĴ", + "m" + ], + [ + "ĠËĮ", + "eËIJ" + ], + [ + "Ġth", + "ËĪaiÉľ" + ], + [ + "Ġv", + "ËĪas" + ], + [ + "Ġn", + "ÉijËIJ" + ], + [ + "p", + "ÉĻn" + ], + [ + "Ġp", + "ËĮÉĻɾ" + ], + [ + "ĠÉĹ", + "ËĪaËIJɪ" + ], + [ + "ËĪou", + "Éľ" + ], + [ + "ĠÊIJ", + "ËĪuÉľ" + ], + [ + "ĠmËĪa", + "n" + ], + [ + "ĠtËĪÉĻ", + "ÉªÉľ" + ], + [ + "Ġl", + "ËĪaËIJÊĬ" + ], + [ + "m", + "ËĪÉĽnte" + ], + [ + "ĠfËĪa", + "m" + ], + [ + "s", + "jËĪÉĶ" + ], + [ + "Ġp", + "ËĪÉĻ" + ], + [ + "ËĪeËIJ", + "m" + ], + [ + "Ġp", + "ËĪÊĮr" + ], + [ + "j", + "ËĪi" + ], + [ + "Ġl", + "ÉĽ" + ], + [ + "Ġt", + "en" + ], + [ + "ËĪoËIJ", + "ra" + ], + [ + "k", + "i" + ], + [ + "ĠÊĤ", + "ËĪaËIJÊĬ" + ], + [ + "k", + "ɪ" + ], + [ + "bËIJ", + "e" + ], + [ + "ËĪa", + "lt" + ], + [ + "ð", + "ɪ" + ], + [ + "p", + "ËĪi" + ], + [ + "ĠËĮÉĽ", + "nt" + ], + [ + "Ġm", + "ËĪei" + ], + [ + "Ġh", + "ËĪÉĻÊĬ" + ], + [ + "Ġh", + "ËĪÉĽÉ¾" + ], + [ + "j", + "ËĪÉij" + ], + [ + "ĠhËĪÊĬ", + "aËIJ" + ], + [ + "m", + "Éľ" + ], + [ + "Ġd", + "ʰ" + ], + [ + "ĠtÊĥ", + "ËĪe" + ], + [ + "l", + "ËĪÉĽ" + ], + [ + "ËĪaËIJt", + "e" + ], + [ + "Ġp", + "ËĪuËIJ" + ], + [ + "Ġm", + "ËĪÊĬ" + ], + [ + "ËĪaËIJɪ", + "ÊĪ" + ], + [ + "d", + "iËIJ" + ], + [ + 
"Ġfɹ", + "ÉĴm" + ], + [ + "Ġh", + "ËĪÉijËIJ" + ], + [ + "β", + "o" + ], + [ + "ĠmËĪi", + "Éľn" + ], + [ + "Ġð", + "iËIJz" + ], + [ + "Ġk", + "ËĪou" + ], + [ + "ËĪiËIJ", + "na" + ], + [ + "Ġav", + "ËĮeva" + ], + [ + "Ġ", + "ËĪaËIJɾ" + ], + [ + "Ġn", + "ËĪuËIJɾ" + ], + [ + "Ġβ", + "ËĪe" + ], + [ + "Ġz", + "aɪn" + ], + [ + "ËĪÉĽ", + "d" + ], + [ + "É", + "Ĺ" + ], + [ + "ËĪeɪ", + "k" + ], + [ + "s", + "ËĮÉĻÊĬ" + ], + [ + "ËĪeËIJ", + "ÉŁ" + ], + [ + "ĠÊĤ", + "ËĪÉĻËIJ" + ], + [ + "j", + "e" + ], + [ + "cʰ", + "ËIJ" + ], + [ + "ËĪÉĶ", + "r" + ], + [ + "ÉĽ", + "ËIJ" + ], + [ + "ĠtÉķhËĪy", + "Ã¦Éľn" + ], + [ + "ĠËĮaɪn", + "ÉĻn" + ], + [ + "ĠiËIJ", + "n" + ], + [ + "ĠbËĪÊĮ", + "c" + ], + [ + "ËĪiËIJ", + "m" + ], + [ + "ɾ", + "as" + ], + [ + "ËĮÉĻ", + "s" + ], + [ + "Ġv", + "ËĪeËIJ" + ], + [ + "ĠËĪÉĻr", + "Éľ" + ], + [ + "Ġd", + "uËIJ" + ], + [ + "nt", + "ÉĻ" + ], + [ + "Ġpɹ", + "ËĪÉĴ" + ], + [ + "Ġb", + "ËĪɪ" + ], + [ + "ĠwËĪo", + "Éľ" + ], + [ + "n", + "ËĮi" + ], + [ + "Ġh", + "ÉIJ" + ], + [ + "Ġk", + "ËĪÉĽ" + ], + [ + "Ġe", + "t" + ], + [ + "jËĪÉĽ", + "ndo" + ], + [ + "ĠËĪai", + "Éľ" + ], + [ + "Ġl", + "i" + ], + [ + "ĠËĪaÊĬ", + "s" + ], + [ + "kËIJ", + "o" + ], + [ + "ĠÉĹ", + "ËĪyÉĻ" + ], + [ + "k", + "eËIJ" + ], + [ + "Ġf", + "ËĪiËIJl" + ], + [ + "Ġbʰ", + "ËĪaËIJi" + ], + [ + "ĠÉ¡ÉĻ", + "Êĥ" + ], + [ + "ÊĴ", + "ËĪe" + ], + [ + "Ġn", + "jËĪuËIJ" + ], + [ + "ĠËĪa", + "k" + ], + [ + "ĠÉĹ", + "ËĪaËIJ" + ], + [ + "z", + "ËĪa" + ], + [ + "v", + "ËĪe" + ], + [ + "ĠhËĮa", + "ÊĬ" + ], + [ + "ÉIJ", + "ç" + ], + [ + "ĠɾËĪÊĮ", + "kʰ" + ], + [ + "p", + "ËĪe" + ], + [ + "ĠtÉĻ", + "bi" + ], + [ + "ĠpËĪÊĮhÉĻl", + "ËĮeËIJ" + ], + [ + "Ġf", + "ËĪÉĽ" + ], + [ + "Ġw", + "ËĮɪtÊĥ" + ], + [ + "ĠtÉķËĪy", + "ÉĽÉľ" + ], + [ + "w", + "ËĮe" + ], + [ + "ËĮa", + "ɪt" + ], + [ + "ĠnÉijËIJ", + "x" + ], + [ + "ĠkËĪÉĶËIJ", + "n" + ], + [ + "ÊĬ", + "k" + ], + [ + "ĠbËĪaËIJ", + "d" + ], + [ + "Åĭ", + "ÉĻn" + ], + [ + "Ġn", + "i" + ], + [ + "Ġb", + "ËĪe" + ], + [ + "Ġm", + "ËĮÊĬ" + ], + [ + "ËĪa", + "r" + 
], + [ + "ĠmËĮe", + "ɪk" + ], + [ + "Ġs", + "ËĪaËIJɾ" + ], + [ + "β", + "e" + ], + [ + "ĠtÉķhËĪi", + "ÉľÅĭ" + ], + [ + "it", + "ËĪe" + ], + [ + "k", + "ËĮe" + ], + [ + "ËĪÉĽËIJ", + "l" + ], + [ + "ËĮ", + "ÉĴn" + ], + [ + "ËĮ", + "Éij" + ], + [ + "Ġb", + "ËĪɪl" + ], + [ + "Ġw", + "ÊĬd" + ], + [ + "Ġb", + "ËĪoËIJl" + ], + [ + "r", + "d" + ], + [ + "i", + "ÉĻ" + ], + [ + "Ġd", + "a" + ], + [ + "Ġb", + "ËĪaËIJÊĬ" + ], + [ + "ĠnËĪÊĮmb", + "ÉĻɾ" + ], + [ + "ËĪaËIJɪ", + "Éľ" + ], + [ + "ĠÉĽ", + "m" + ], + [ + "Ġm", + "iËIJɾ" + ], + [ + "ËĪeɪ", + "m" + ], + [ + "l", + "os" + ], + [ + "ËĮÉĽ", + "t" + ], + [ + "ĠËĮaÊĬ", + "s" + ], + [ + "ĠmËĪa", + "Éľt" + ], + [ + "Ġw", + "ËĪuÉĻ" + ], + [ + "Ġw", + "ËĪeɪ" + ], + [ + "Ġse", + "ɲ" + ], + [ + "Ġb", + "jËĪÉĽ" + ], + [ + "Ġw", + "ÉĽn" + ], + [ + "f", + "l" + ], + [ + "Ġkh", + "wËĪa" + ], + [ + "d", + "ËĪÉĽ" + ], + [ + "v", + "ɹɪ" + ], + [ + "ĠËĪa", + "ɾ" + ], + [ + "jËĪÉiju", + "Éľ" + ], + [ + "ĠËĮaËIJpk", + "ËĮeËIJ" + ], + [ + "b", + "Êģ" + ], + [ + "ĠtËĪaɪ", + "m" + ], + [ + "Ġ", + "ËĪÉij" + ], + [ + "Ġs", + "ËĮa" + ], + [ + "Ġz", + "ËĪoɪ" + ], + [ + "ËĪÉĶɾ", + "a" + ], + [ + "Ġd", + "ËĪø" + ], + [ + "ËĪÉĶɾ", + "t" + ], + [ + "ĠÅĭ", + "ËĪÉĶ" + ], + [ + "m", + "in" + ], + [ + "Ġl", + "ËĪÊĬk" + ], + [ + "ËĪÉĶËIJ", + "t" + ], + [ + "ĠËĪÉĶ", + "tɾ" + ], + [ + "Ġf", + "ËĪaɪ" + ], + [ + "ĠÉ¡", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "ÉĻn" + ], + [ + "k", + "ËĪÉĶ" + ], + [ + "ĠvËĪÉĽ", + "ɹi" + ], + [ + "m", + "ÉĽ" + ], + [ + "ËĪaɪ", + "z" + ], + [ + "Ġe", + "sp" + ], + [ + "ɲ", + "a" + ], + [ + "Ġl", + "ËĪo" + ], + [ + "ËĪÉĽËIJ", + "ra" + ], + [ + "β", + "ËĪi" + ], + [ + "ou", + "Éľ" + ], + [ + "ËĮÉĻ", + "k" + ], + [ + "tÊĥ", + "uËIJ" + ], + [ + "Ġn", + "ËĪyÉĻ" + ], + [ + "ÊĪ", + "ɾ" + ], + [ + "ĠÉ¡", + "ËĪy" + ], + [ + "ĠtËĪo", + "ðo" + ], + [ + "ËĪɪ", + "çt" + ], + [ + "Ġm", + "ɪç" + ], + [ + "ĠËĪa", + "nd" + ], + [ + "Ġkw", + "ËĮÉĽl" + ], + [ + "ĠÊĤ", + "ËĪaËIJ" + ], + [ + "ĠnËĪi", + "Éľ" + ], + [ + "ËĪÉĶ", + "p" + ], + [ + "ËĪiËIJ", + "z" 
+ ], + [ + "ĠÊĤ", + "ËĪaÊĬ" + ], + [ + "ĠɾËĮÉĻh", + "i" + ], + [ + "ĠsËĮÊĬ", + "o" + ], + [ + "ĠÉĽ", + "É¡" + ], + [ + "Ġd", + "Åĵ" + ], + [ + "ĠÉ¡ËĮaËIJ", + "ÉªÉľ" + ], + [ + "d", + "ɪ" + ], + [ + "l", + "ËĮa" + ], + [ + "st", + "ËĪi" + ], + [ + "ĠdËĮiËIJ", + "z" + ], + [ + "Ġt", + "ËĮÊĬ" + ], + [ + "θ", + "i" + ], + [ + "ĠËĪɪ", + "skËĮoËIJ" + ], + [ + "nd", + "ÉĻn" + ], + [ + "Ġts", + "v" + ], + [ + "Ġh", + "ËĪÉĻËIJ" + ], + [ + "ĠÊĥ", + "ËĪÊĬ" + ], + [ + "ÉĻt", + "ËĮeËIJ" + ], + [ + "p", + "ËĮÉĽ" + ], + [ + "ËĪaɾ", + "ÉĶn" + ], + [ + "Ġp", + "ÉĽÊģ" + ], + [ + "Ġ", + "y" + ], + [ + "m", + "nËĮeËIJ" + ], + [ + "ËĪÉĽ", + "llo" + ], + [ + "ĠÉ¡", + "ËĪÉĻ" + ], + [ + "ĠËĮa", + "d" + ], + [ + "ĠÊĥ", + "v" + ], + [ + "ËĪÊı", + "ɾ" + ], + [ + "r", + "ËĪe" + ], + [ + "y", + "ËIJ" + ], + [ + "Ġp", + "ËĪaËIJs" + ], + [ + "Ġ", + "ËĪÉĽn" + ], + [ + "ɪ", + "dÊĴ" + ], + [ + "ËĪua", + "i" + ], + [ + "Ġf", + "i" + ], + [ + "Ġt", + "ËĪyÉĻ" + ], + [ + "ËĪaËIJ", + "ÉŁ" + ], + [ + "Ġt", + "jËĪe" + ], + [ + "ËĪaËIJn", + "aËIJ" + ], + [ + "st", + "ɾ" + ], + [ + "Êİ", + "e" + ], + [ + "ËĮe", + "ɪt" + ], + [ + "b", + "a" + ], + [ + "ð", + "as" + ], + [ + "v", + "Êģ" + ], + [ + "Ġz", + "ËĪÉĻËIJ" + ], + [ + "ËĪaËIJ", + "li" + ], + [ + "ÉŁÊ°", + "eËIJ" + ], + [ + "ËĪaËIJt", + "eËIJ" + ], + [ + "Ġv", + "ËĪa" + ], + [ + "Ġsa", + "l" + ], + [ + "ËĪaËIJ", + "no" + ], + [ + "ĠÉ¡ÉĻ", + "z" + ], + [ + "ĠhËĪoËIJ", + "ti" + ], + [ + "Ġɲ", + "ËĪiÉĽ" + ], + [ + "t", + "Éľ" + ], + [ + "ĠËĪaËIJ", + "p" + ], + [ + "Ġw", + "ËĪÉĽl" + ], + [ + "Ġm", + "ËĪɪl" + ], + [ + "Ġfy", + "ËIJɾ" + ], + [ + "ËĪÉĽËIJs", + "aËIJ" + ], + [ + "Ġb", + "ËĮiËIJ" + ], + [ + "ËĪaËIJ", + "jaËIJ" + ], + [ + "ËĪɪ", + "p" + ], + [ + "Ġf", + "Êģ" + ], + [ + "tsi", + "ËĪoËIJne" + ], + [ + "Ġw", + "ËĪuÉľ" + ], + [ + "Ġv", + "i" + ], + [ + "ĠwËĪÉij", + "Éľn" + ], + [ + "ËĪoËIJ", + "n" + ], + [ + "ĠÉĹ", + "ËĪÉĻɪ" + ], + [ + "ĠÊĿ", + "ËĪo" + ], + [ + "Ġr", + "a" + ], + [ + "m", + "ÉĻnt" + ], + [ + "ËĪaÊĬ", + "nd" + ], + [ + "Ġp", + "ÉĽÉ¾" 
+ ], + [ + "ĠÉĹ", + "ËĪaËIJÊĬ" + ], + [ + "oËIJ", + "ɾ" + ], + [ + "h", + "ËĪo" + ], + [ + "ĠÉĴ", + "n" + ], + [ + "ĠÊİ", + "e" + ], + [ + "ĠsËĪɪ", + "ks" + ], + [ + "É¡", + "n" + ], + [ + "ĠÉ¡", + "ËĪa" + ], + [ + "Ġ", + "θj" + ], + [ + "Ġp", + "ËĪe" + ], + [ + "sp", + "e" + ], + [ + "Ġv", + "ËĪÉĻ" + ], + [ + "Ġf", + "ËĪɪ" + ], + [ + "ĠËĮɪnt", + "ÊĬ" + ], + [ + "l", + "ÉĻn" + ], + [ + "Ġn", + "ËĪiËIJd" + ], + [ + "ĠsËĮÊĬ", + "a" + ], + [ + "ĠËĪu", + "m" + ], + [ + "Ġd", + "ËĪeɪ" + ], + [ + "ĠËĪÊĮ", + "bʰi" + ], + [ + "ËĪÉijËIJ", + "ɾ" + ], + [ + "Ġb", + "ËĪiÉĽÉľt" + ], + [ + "Êİ", + "os" + ], + [ + "Ġtsh", + "ËĪaiÉľ" + ], + [ + "ĠËĮɪ", + "skËĮaËIJ" + ], + [ + "ĠaÊĬ", + "ÉĻ" + ], + [ + "ĠËĪy", + "æ" + ], + [ + "Ġd", + "yn" + ], + [ + "Ġm", + "ËĪiËIJn" + ], + [ + "ĠËĪÊĮ", + "cʰËIJ" + ], + [ + "Ġs", + "ÉĽ" + ], + [ + "Ġn", + "ËĪy" + ], + [ + "Ġn", + "ËĮÉĽl" + ], + [ + "É¡", + "ɾ" + ], + [ + "Êĥ", + "ËĪe" + ], + [ + "ĠÊĤ", + "ËĮÉĽ" + ], + [ + "ĠËĪÉĽ", + "vɹɪ" + ], + [ + "ËĪÉĽl", + "p" + ], + [ + "ĠbËĪa", + "k" + ], + [ + "Ġ", + "eËIJ" + ], + [ + "Ġf", + "ËĪaËIJ" + ], + [ + "Ġk", + "ÉĽl" + ], + [ + "ĠËĪeËIJ", + "s" + ], + [ + "j", + "ËĪaËIJd" + ], + [ + "Ġl", + "ËĮi" + ], + [ + "mb", + "ɾe" + ], + [ + "k", + "tÉĻ" + ], + [ + "nt", + "a" + ], + [ + "t", + "ËĪu" + ], + [ + "Ġð", + "ËĪat" + ], + [ + "ĠËĪa", + "β" + ], + [ + "ÉĻɹ", + "i" + ], + [ + "ĠkwËĮÉĽ", + "lla" + ], + [ + "Ġb", + "ÉĻn" + ], + [ + "r", + "ËĮÉĽ" + ], + [ + "Ġn", + "ÉĶ" + ], + [ + "ĠÉ¡", + "ËĪɪ" + ], + [ + "ĠËĪa", + "p" + ], + [ + "ɹ", + "ÉĻ" + ], + [ + "ËĪa", + "Éľkh" + ], + [ + "ĠÊIJ", + "ËĪi" + ], + [ + "Ġ", + "ËĪÉijËIJ" + ], + [ + "ɪ", + "É¡ÉĻn" + ], + [ + "Ġw", + "ËĪai" + ], + [ + "Ġp", + "ÉĻt" + ], + [ + "kËIJ", + "a" + ], + [ + "Ġb", + "ËĪÉĽËIJ" + ], + [ + "ËĪeËIJ", + "Êĭ" + ], + [ + "ls", + "ÉĻÊĬ" + ], + [ + "ĠcËĪaËIJh", + "ɪËĮeËIJ" + ], + [ + "Ġk", + "ÉĻn" + ], + [ + "ĠËĮaɪn", + "ÉĻm" + ], + [ + "ËĪuËIJ", + "t" + ], + [ + "Ġh", + "ËĪaÊĬ" + ], + [ + "Ġt", + "ËĪanto" + ], + [ + "ĠhÉIJ", + "z" + 
], + [ + "Ġs", + "ËĪÊĮɾ" + ], + [ + "Ġn", + "o" + ], + [ + "Ġt", + "ËĪÉĶËIJ" + ], + [ + "Ġz", + "ËĪaɪ" + ], + [ + "ĠtÉķËĪiÉĽ", + "Éľ" + ], + [ + "Ġko", + "zËĪi" + ], + [ + "Ġk", + "ËĪei" + ], + [ + "ð", + "ËĪÉĶɾ" + ], + [ + "ËĮÉĶ", + "Êģ" + ], + [ + "Ġt", + "ËĪÊĮɾ" + ], + [ + "ĠÊIJ", + "ËĪÉĻ" + ], + [ + "ĠÉķËĪy", + "ÉĽÉľ" + ], + [ + "ĠmËĮÊĬ", + "ÉŁÊ°eËIJ" + ], + [ + "m", + "f" + ], + [ + "Ġv", + "ËĪiËIJdÉľ" + ], + [ + "k", + "ËĪa" + ], + [ + "ĠÉIJ", + "É¡" + ], + [ + "k", + "w" + ], + [ + "ĠÊģ", + "ÉĽ" + ], + [ + "x", + "ÉĻn" + ], + [ + "Ġd", + "ÊĬ" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮeËIJ" + ], + [ + "jËĪaËIJd", + "aËIJ" + ], + [ + "Ġf", + "ÉĻ" + ], + [ + "ĠËĮi", + "mp" + ], + [ + "Ġh", + "ɪz" + ], + [ + "Ġ", + "ʰÏĩ" + ], + [ + "ËĪoËIJ", + "ni" + ], + [ + "Ġx", + "ËĪiÉľ" + ], + [ + "ËĪeËIJ", + "sÊĪ" + ], + [ + "Êı", + "bÉľ" + ], + [ + "ËĮÉĶɾ", + "ke" + ], + [ + "ĠÉ¡", + "ËĪÉĻÊĬ" + ], + [ + "ËĪɪ", + "ÊĥÉĻn" + ], + [ + "l", + "es" + ], + [ + "Ġf", + "ËĪiËIJ" + ], + [ + "É¡", + "tÉĻ" + ], + [ + "ËĪeËIJ", + "re" + ], + [ + "Ġv", + "ËĮaËIJ" + ], + [ + "Ġ", + "ËĪeɪ" + ], + [ + "Ġm", + "ËĪuÉĻÉľn" + ], + [ + "ĠÉ¡ËĪÊĬ", + "d" + ], + [ + "ĠmËĮa", + "ɪn" + ], + [ + "z", + "ËĪe" + ], + [ + "ĠlËĪi", + "Éľ" + ], + [ + "Ġm", + "u" + ], + [ + "Ġk", + "ËĮÉĽl" + ], + [ + "Ġj", + "ËĮÉĻh" + ], + [ + "Ġf", + "ËĮÉĶɾ" + ], + [ + "f", + "ɹ" + ], + [ + "Ġk", + "ËĪaɪn" + ], + [ + "ĠËĪÉĴ", + "lsÉĻÊĬ" + ], + [ + "θ", + "ɪÅĭ" + ], + [ + "Ġth", + "ËĪonÉ¡Éľ" + ], + [ + "t", + "ËĪÉij" + ], + [ + "θj", + "o" + ], + [ + "m", + "ËĪÉĶ" + ], + [ + "Ġ", + "os" + ], + [ + "Ġs", + "ÊĬ" + ], + [ + "ĠsËĪÊĮ", + "mÉĻ" + ], + [ + "ĠvËĮÉĽ", + "n" + ], + [ + "n", + "ËĪo" + ], + [ + "ĠËĪak", + "tÊĥuËIJ" + ], + [ + "É£", + "a" + ], + [ + "Ġtʰ", + "i" + ], + [ + "Ġf", + "ËĮi" + ], + [ + "Ġv", + "ËĪÉĽl" + ], + [ + "ĠtËĪu", + "tËIJi" + ], + [ + "x", + "os" + ] + ] + } +} \ No newline at end of file From 5e78e46b35602b519dde97c8d0e3b24176c6a42b Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sun, 1 Feb 2026 23:10:02 
+0000 Subject: [PATCH 29/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/easy_magpietts.py | 1 + nemo/collections/tts/models/easy_magpietts.py | 17 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 27 ++- .../ipa_scripts/analyze_ipa_tokenization.py | 208 +++++++++--------- .../ipa_scripts/train_ipa_bpe_tokenizer.py | 119 +++++----- 5 files changed, 191 insertions(+), 181 deletions(-) diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 705c4ab77134..4195060b87ef 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -21,6 +21,7 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager + @hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 68a48ab9701c..f6dd9c7728b3 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -352,7 +352,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.audio_out_projection = nn.Identity() self.final_proj = nn.Linear( - self.audio_embedding_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + self.audio_embedding_dim, + self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') @@ -376,7 +377,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) if self.audio_embedding_dim != local_transformer_hidden_dim: - self.local_transformer_audio_out_projection = nn.Linear(local_transformer_hidden_dim, self.audio_embedding_dim) + self.local_transformer_audio_out_projection = nn.Linear( + 
local_transformer_hidden_dim, self.audio_embedding_dim + ) else: self.local_transformer_audio_out_projection = nn.Identity() local_transformer_out_projections = [] @@ -1365,7 +1368,7 @@ def process_batch( # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False - if (dropout_phoneme_input and dropout_text_input): + if dropout_phoneme_input and dropout_text_input: # Only one of the two can be True, so choose randomly dropout_phoneme_input = random.random() < 0.5 dropout_text_input = not dropout_phoneme_input @@ -1462,7 +1465,9 @@ def process_batch( remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) else: # Log Warning - print(f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}") + print( + f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}" + ) remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded @@ -2035,9 +2040,7 @@ def infer_batch( # Project from hidden_dim to audio_embedding_dim, then to logits last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj( - last_hidden_audio - ) # (B, num_codebooks * num_tokens_per_codebook) + all_code_logits_t = self.final_proj(last_hidden_audio) # (B, num_codebooks * num_tokens_per_codebook) if self.phoneme_tokenizer is not None: all_code_logits_t_phoneme = self.phoneme_final_proj( diff --git 
a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py index 61a124d56ccc..10972d1bdc6a 100644 --- a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py +++ b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py @@ -45,13 +45,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + # Map your dataset language keys to espeak voice codes (adjust as needed). # For German, espeak-ng uses "de" typically. ESPEAK_VOICE_BY_LANG: Dict[str, str] = { @@ -100,8 +101,7 @@ def _find_espeak_binary() -> str: if shutil.which(exe): return exe raise RuntimeError( - "Neither 'espeak-ng' nor 'espeak' was found on PATH. " - "Install espeak-ng (recommended) or espeak." + "Neither 'espeak-ng' nor 'espeak' was found on PATH. " "Install espeak-ng (recommended) or espeak." 
) @@ -204,7 +204,7 @@ def add_ipa_to_cut( custom["normalized_text"] = text else: text = custom.get("normalized_text") or sup.get("text") - + if not text: continue @@ -234,9 +234,10 @@ def process_shard( cache = IPACache() n = 0 - with gzip.open(shard_path, "rt", encoding="utf-8") as fin, gzip.open( - out_shard_path, "wt", encoding="utf-8" - ) as fout: + with ( + gzip.open(shard_path, "rt", encoding="utf-8") as fin, + gzip.open(out_shard_path, "wt", encoding="utf-8") as fout, + ): for line in fin: line = line.strip() if not line: @@ -315,25 +316,23 @@ def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) continue process_cuts_dir(lang, cuts_dir) - + return True def main() -> None: - parser = argparse.ArgumentParser( - description="Add IPA strings to Lhotse cuts jsonl.gz shards." - ) + parser = argparse.ArgumentParser(description="Add IPA strings to Lhotse cuts jsonl.gz shards.") parser.add_argument( "--lang", type=str, required=True, - help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages." + help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages.", ) parser.add_argument( "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", ) args = parser.parse_args() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py index 7032e1eeca0f..e2d53c3099d3 100644 --- a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py +++ b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py @@ -64,13 +64,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + OUTPUT_SUFFIX = "_with_ipa" SHARD_GLOB = "cuts.*.jsonl.gz" @@ -78,6 +79,7 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ @dataclass class TextPair: """A pair of raw text and its IPA phonemization with audio duration.""" + raw_text: str ipa_text: str lang: str @@ -87,6 +89,7 @@ class TextPair: @dataclass class TokenizationStats: """Statistics for tokenization comparison (tokens per second).""" + lang: str num_samples: int total_duration: float # sum of all durations in seconds @@ -113,7 +116,7 @@ def iter_shards(ipa_dir: Path) -> List[Path]: def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: """ Extract text pairs (raw text + IPA) from a single shard file. 
- + Yields: TextPair objects with raw_text, ipa_text, and duration """ @@ -132,7 +135,7 @@ def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[Text ipa = custom.get("ipa") # Get raw text - prefer normalized_text, fallback to text raw_text = custom.get("normalized_text") or sup.get("text") - + if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): ipa = ipa.strip() raw_text = raw_text.strip() @@ -150,31 +153,31 @@ def sample_text_pairs( ) -> List[TextPair]: """ Sample text pairs from a language's cuts_with_ipa directories. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories num_samples: Number of samples to collect seed: Random seed for reproducibility - + Returns: List of TextPair objects """ random.seed(seed) - + if lang not in cuts_dirs: raise ValueError(f"Unknown language: {lang}") - + # Collect all text pairs from all directories all_pairs = [] for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) continue - + shards = iter_shards(ipa_dir) for shard in shards: for pair in extract_text_pairs_from_shard(shard, lang): @@ -186,12 +189,12 @@ def sample_text_pairs( break if len(all_pairs) >= num_samples * 10: break - + # Sample if len(all_pairs) <= num_samples: print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") return all_pairs - + return random.sample(all_pairs, num_samples) @@ -202,14 +205,14 @@ def iter_ipa_strings_for_lang( """Iterate over all IPA strings for a single language (memory-efficient).""" if lang not in cuts_dirs: return - + for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): continue - + shards = iter_shards(ipa_dir) for shard in shards: with gzip.open(shard, "rt", encoding="utf-8") as f: @@ -246,31 +249,31 @@ def 
simple_sample_ipa_strings( ) -> List[str]: """ Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - + This avoids reading through all data like reservoir sampling does. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories k: Number of samples to select max_collect: Maximum number of strings to collect before sampling seed: Random seed for reproducibility - + Returns: List of up to k sampled IPA strings """ rng = random.Random(seed) collected: List[str] = [] - + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): collected.append(ipa) if len(collected) >= max_collect: break - + # If we have fewer than k, return all if len(collected) <= k: return collected - + # Otherwise, randomly sample k return rng.sample(collected, k) @@ -285,11 +288,11 @@ def create_balanced_corpus( ) -> Tuple[str, Dict[str, int]]: """ Create a balanced IPA corpus file with equal samples from each language. - + Uses a memory-efficient two-pass approach: 1. First pass: Count sentences per language (up to max_count_per_lang) 2. 
Second pass: Use simple sampling to select samples - + Args: train_langs: List of language codes to include cuts_dirs: Dictionary mapping language codes to lists of cuts directories @@ -297,14 +300,14 @@ def create_balanced_corpus( max_samples_per_lang: Optional cap on samples per language max_count_per_lang: Max count per language when counting IPA strings seed: Random seed for reproducibility - + Returns: Tuple of (corpus_file_path, dict of lang -> actual_count) """ # First pass: Count sentences per language print("[INFO] Pass 1: Counting IPA strings per language...") lang_counts: Dict[str, int] = {} - + for lang in train_langs: if lang not in cuts_dirs: print(f"[WARN] Language {lang} not in config, skipping") @@ -313,42 +316,42 @@ def create_balanced_corpus( count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) lang_counts[lang] = count print(f"{count} IPA strings") - + if not lang_counts: raise ValueError("No IPA strings found for any language") - + # Find minimum count across languages min_count = min(lang_counts.values()) print(f"[INFO] Minimum count across languages: {min_count}") - + # Apply max_samples_per_lang cap if specified samples_per_lang = min_count if max_samples_per_lang is not None and max_samples_per_lang < min_count: samples_per_lang = max_samples_per_lang print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - + # Second pass: Sample from each language using simple sampling print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") actual_counts: Dict[str, int] = {} total_written = 0 - + with open(output_file, "w", encoding="utf-8") as f: for lang in lang_counts.keys(): print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) # Use different seed per language for variety, but reproducible lang_seed = seed + hash(lang) % 10000 sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, max_count_per_lang, lang_seed) - + for ipa in sampled: f.write(ipa + "\n") total_written += 1 - 
+ actual_counts[lang] = len(sampled) print(f"sampled {len(sampled)} strings") - + print(f"[INFO] Total IPA strings written to corpus: {total_written}") print(f"[INFO] Balanced corpus saved to: {output_file}") - + return output_file, actual_counts @@ -360,48 +363,48 @@ def train_ipa_bpe_tokenizer( ) -> Tokenizer: """ Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. - + Args: output_dir: Directory to save tokenizer files vocab_size: Target vocabulary size corpus_file: Path to the IPA corpus file (one IPA string per line) min_frequency: Minimum frequency for a token to be included - + Returns: Trained Tokenizer object """ tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") os.makedirs(tokenizer_dir, exist_ok=True) - + tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") - + # Check if already trained if os.path.exists(tokenizer_file): print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") return Tokenizer.from_file(tokenizer_file) - + # Initialize tokenizer tokenizer = Tokenizer(BPE(unk_token="")) tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - + special_tokens = ["", "", ""] - + trainer = BpeTrainer( vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens, show_progress=True, ) - + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") tokenizer.train(files=[corpus_file], trainer=trainer) - + # Save tokenizer.save(tokenizer_file) tokenizer.model.save(tokenizer_dir) - + print(f"[INFO] Saved tokenizer to {tokenizer_dir}") - + return tokenizer @@ -418,35 +421,35 @@ def compute_stats( qwen_counts = [] nemotron_counts = [] ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} - + for pair in text_pairs: # Qwen tokenizer on raw text qwen_tokens = qwen_tokenizer.encode(pair.raw_text) qwen_counts.append(len(qwen_tokens)) - + # Nemotron tokenizer on raw text nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) 
nemotron_counts.append(len(nemotron_tokens)) - + # IPA tokenizers on IPA text for vocab_size, tokenizer in ipa_tokenizers.items(): ipa_tokens = tokenizer.encode(pair.ipa_text) ipa_counts[vocab_size].append(len(ipa_tokens.ids)) - + # Calculate total duration and token counts total_duration = sum(pair.duration for pair in text_pairs) qwen_total = sum(qwen_counts) nemotron_total = sum(nemotron_counts) - + # Compute tokens per second qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 - + ipa_tps = {} for vocab_size in ipa_tokenizers.keys(): ipa_total = sum(ipa_counts[vocab_size]) ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 - + return TokenizationStats( lang=lang, num_samples=len(text_pairs), @@ -462,32 +465,32 @@ def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int] print("\n" + "=" * 120) print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") print("=" * 120) - + # Header header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" for vs in vocab_sizes: header += f" {'IPA-' + str(vs):>10}" print(header) print("-" * 120) - + # Data rows for stats in all_stats: row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" for vs in vocab_sizes: row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" print(row) - + # Aggregated stats print("-" * 120) total_samples = sum(s.num_samples for s in all_stats) total_duration = sum(s.total_duration for s in all_stats) - + # Compute overall tokens per second (weighted by duration) total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) overall_qwen_tps = total_qwen_tokens 
/ total_duration if total_duration > 0 else 0 overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 - + agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" for vs in vocab_sizes: total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) @@ -495,7 +498,7 @@ def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int] agg_row += f" {overall_ipa_tps:>10.2f}" print(agg_row) print("=" * 120) - + # Summary print("\nSUMMARY:") print(f" - Total samples analyzed: {total_samples}") @@ -523,20 +526,21 @@ def save_results_json( }, "results": [], } - + for stats in all_stats: - output["results"].append({ - "lang": stats.lang, - "num_samples": stats.num_samples, - "total_duration_seconds": stats.total_duration, - "qwen_tokens_per_second": stats.qwen_tokens_per_second, - "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, - "ipa_tokens_per_second": { - str(vs): stats.ipa_tokens_per_second[vs] - for vs in stats.ipa_tokens_per_second.keys() + output["results"].append( + { + "lang": stats.lang, + "num_samples": stats.num_samples, + "total_duration_seconds": stats.total_duration, + "qwen_tokens_per_second": stats.qwen_tokens_per_second, + "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, + "ipa_tokens_per_second": { + str(vs): stats.ipa_tokens_per_second[vs] for vs in stats.ipa_tokens_per_second.keys() + }, } - }) - + ) + with open(output_path, "w", encoding="utf-8") as f: json.dump(output, f, indent=2) print(f"[INFO] Saved results to {output_path}") @@ -555,9 +559,7 @@ def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: def main(): - parser = argparse.ArgumentParser( - description="Compare tokenization between Qwen and IPA BPE tokenizers." 
- ) + parser = argparse.ArgumentParser(description="Compare tokenization between Qwen and IPA BPE tokenizers.") parser.add_argument( "--output_dir", type=str, @@ -598,7 +600,7 @@ def main(): "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}", ) parser.add_argument( "--max_count_per_lang", @@ -607,15 +609,15 @@ def main(): help="Max count per language when counting IPA strings (default: 100000)", ) args = parser.parse_args() - + os.makedirs(args.output_dir, exist_ok=True) - + # Load config config_path = Path(args.config) if args.config else None cuts_dirs = load_cuts_dirs_config(config_path) available_langs = list(cuts_dirs.keys()) print(f"[INFO] Loaded config with languages: {available_langs}") - + # Parse train and test languages try: train_langs = parse_lang_arg(args.train_langs, available_langs) @@ -623,20 +625,20 @@ def main(): except ValueError as e: print(f"[ERROR] {e}") sys.exit(1) - + print(f"[INFO] Training languages: {train_langs}") print(f"[INFO] Testing languages: {test_langs}") print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") - + # Step 1: Create balanced IPA corpus once print("\n" + "=" * 60) print("STEP 1: Creating balanced IPA corpus") print("=" * 60) - + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - + # Check if corpus already exists if os.path.exists(corpus_file): print(f"[INFO] Using existing corpus file: {corpus_file}") @@ -652,12 +654,12 @@ def main(): max_count_per_lang=args.max_count_per_lang, seed=args.seed, ) - + # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) print("\n" + "=" * 60) print("STEP 2: Training IPA BPE tokenizers") print("=" * 60) - + 
ipa_tokenizers = {} for vocab_size in VOCAB_SIZES: print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") @@ -667,60 +669,64 @@ def main(): corpus_file=corpus_file, min_frequency=2, ) - + # Step 3: Load Qwen and Nemotron tokenizers print("\n" + "=" * 60) print("STEP 3: Loading Qwen and Nemotron tokenizers") print("=" * 60) - + print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") - + print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") - - nemotron_tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True) - + + nemotron_tokenizer = AutoTokenizer.from_pretrained( + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True + ) + print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") - + # Step 4: Sample text pairs and compute statistics (on test languages) print("\n" + "=" * 60) print("STEP 4: Sampling and analyzing (test languages)") print("=" * 60) - + all_stats = [] for lang in test_langs: print(f"\n[INFO] Processing language: {lang}") - + # Sample text pairs text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) - + if not text_pairs: print(f"[WARN] No text pairs found for {lang}, skipping") continue - + print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") - + # Compute stats stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) all_stats.append(stats) - + # Print intermediate results - print(f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s") + print( + f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} 
tok/s" + ) for vs in VOCAB_SIZES: print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") - + # Step 5: Print and save results print("\n" + "=" * 60) print("STEP 5: Results") print("=" * 60) - + print_stats_table(all_stats, VOCAB_SIZES) - + # Save to JSON with metadata results_path = os.path.join(args.output_dir, "tokenization_comparison.json") save_results_json(all_stats, results_path, train_langs, test_langs) - + print("[INFO] Done!") diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py index c6098d93839a..825129d2c928 100644 --- a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py +++ b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py @@ -50,13 +50,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa SHARD_GLOB = "cuts.*.jsonl.gz" @@ -82,7 +83,7 @@ def iter_shards(ipa_dir: Path) -> List[Path]: def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: """ Extract all IPA strings from a single shard file. - + Yields: IPA strings from cut["supervisions"][i]["custom"]["ipa"] """ @@ -121,11 +122,11 @@ def collect_ipa_strings( ) -> Generator[str, None, None]: """ Collect all IPA strings from the specified language(s). - + Args: cuts_dirs: Dictionary mapping language codes to lists of cuts directories lang: Language code or None for all languages. - + Yields: IPA strings """ @@ -135,17 +136,17 @@ def collect_ipa_strings( if lang not in cuts_dirs: raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") langs_to_process = [lang] - + for lang_code in langs_to_process: print(f"[INFO] Processing language: {lang_code}") for cuts_dir_str in cuts_dirs[lang_code]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) continue - + print(f"[INFO] Reading from: {ipa_dir}") count = 0 for ipa in extract_ipa_from_dir(ipa_dir): @@ -161,14 +162,14 @@ def iter_ipa_strings_for_lang( """Iterate over all IPA strings for a single language (memory-efficient).""" if lang not in cuts_dirs: return - + for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): continue - + for ipa in extract_ipa_from_dir(ipa_dir): yield ipa @@ -192,31 +193,31 @@ def simple_sample_ipa_strings( ) -> List[str]: """ Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - + This avoids reading through all data like reservoir sampling does. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories k: Number of samples to select max_collect: Maximum number of strings to collect before sampling seed: Random seed for reproducibility - + Returns: List of up to k sampled IPA strings """ rng = random.Random(seed) collected: List[str] = [] - + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): collected.append(ipa) if len(collected) >= max_collect: break - + # If we have fewer than k, return all if len(collected) <= k: return collected - + # Otherwise, randomly sample k return rng.sample(collected, k) @@ -242,11 +243,11 @@ def create_balanced_corpus( ) -> Tuple[str, Dict[str, int]]: """ Create a balanced IPA corpus file with equal samples from each language. - + Uses a memory-efficient two-pass approach: 1. First pass: Count sentences per language (up to max_count_per_lang) 2. 
Second pass: Use simple sampling to select samples - + Args: train_langs: List of language codes to include cuts_dirs: Dictionary mapping language codes to lists of cuts directories @@ -254,14 +255,14 @@ def create_balanced_corpus( max_samples_per_lang: Optional cap on samples per language max_count_per_lang: Max count per language when counting IPA strings seed: Random seed for reproducibility - + Returns: Tuple of (corpus_file_path, dict of lang -> actual_count) """ # First pass: Count sentences per language print("[INFO] Pass 1: Counting IPA strings per language...") lang_counts: Dict[str, int] = {} - + for lang in train_langs: if lang not in cuts_dirs: print(f"[WARN] Language {lang} not in config, skipping") @@ -270,42 +271,42 @@ def create_balanced_corpus( count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) lang_counts[lang] = count print(f"{count} IPA strings") - + if not lang_counts: raise ValueError("No IPA strings found for any language") - + # Find minimum count across languages min_count = min(lang_counts.values()) print(f"[INFO] Minimum count across languages: {min_count}") - + # Apply max_samples_per_lang cap if specified samples_per_lang = min_count if max_samples_per_lang is not None and max_samples_per_lang < min_count: samples_per_lang = max_samples_per_lang print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - + # Second pass: Sample from each language using simple sampling print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") actual_counts: Dict[str, int] = {} total_written = 0 - + with open(output_file, "w", encoding="utf-8") as f: for lang in lang_counts.keys(): print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) # Use different seed per language for variety, but reproducible lang_seed = seed + hash(lang) % 10000 sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, max_count_per_lang, lang_seed) - + for ipa in sampled: f.write(ipa + "\n") total_written += 1 - 
+ actual_counts[lang] = len(sampled) print(f"sampled {len(sampled)} strings") - + print(f"[INFO] Total IPA strings written to corpus: {total_written}") print(f"[INFO] Balanced corpus saved to: {output_file}") - + return output_file, actual_counts @@ -317,45 +318,45 @@ def train_bpe_tokenizer( ) -> Tokenizer: """ Train a byte-level BPE tokenizer on IPA strings from a corpus file. - + Args: corpus_file: Path to the IPA corpus file (one IPA string per line) vocab_size: Target vocabulary size min_frequency: Minimum frequency for a token to be included output_dir: Directory to save the tokenizer files - + Returns: Trained Tokenizer object """ # Create output directory os.makedirs(output_dir, exist_ok=True) - + # Check if tokenizer already exists tokenizer_path = os.path.join(output_dir, "tokenizer.json") if os.path.exists(tokenizer_path): print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") return Tokenizer.from_file(tokenizer_path) - + # Count lines in corpus with open(corpus_file, "r", encoding="utf-8") as f: total_count = sum(1 for _ in f) print(f"[INFO] Corpus contains {total_count} IPA strings") - + if total_count == 0: raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") - + # Initialize a byte-level BPE tokenizer tokenizer = Tokenizer(BPE(unk_token="")) - + # Use byte-level pre-tokenization (like GPT-2) tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - + # Add byte-level decoder to properly convert back to original text tokenizer.decoder = ByteLevelDecoder() - + # Define special tokens special_tokens = ["", "", ""] - + # Create trainer trainer = BpeTrainer( vocab_size=vocab_size, @@ -363,27 +364,27 @@ def train_bpe_tokenizer( special_tokens=special_tokens, show_progress=True, ) - + # Train the tokenizer print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") tokenizer.train(files=[corpus_file], trainer=trainer) - + # Save the tokenizer vocab_path = os.path.join(output_dir, "vocab.json") merges_path = os.path.join(output_dir, "merges.txt") - + # Save using the tokenizer's model save method tokenizer.model.save(output_dir) - + # Also save the full tokenizer for easy loading tokenizer.save(tokenizer_path) - + print(f"[INFO] Tokenizer saved to: {output_dir}") print(f"[INFO] - vocab.json: {vocab_path}") print(f"[INFO] - merges.txt: {merges_path}") print(f"[INFO] - tokenizer.json: {tokenizer_path}") print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") - + return tokenizer @@ -431,7 +432,7 @@ def main(): "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", ) parser.add_argument( "--max_count_per_lang", @@ -440,19 +441,19 @@ def main(): help="Max count per language when counting IPA strings (default: 100000)", ) args = parser.parse_args() - + # Load config config_path = Path(args.config) if args.config else None cuts_dirs = load_cuts_dirs_config(config_path) available_langs = get_available_languages(cuts_dirs) - + # Parse train_langs try: train_langs = parse_langs_arg(args.train_langs, available_langs) except ValueError as e: print(f"[ERROR] {e}") sys.exit(1) - + print(f"[INFO] Training IPA BPE tokenizer") print(f"[INFO] Output directory: {args.output_dir}") print(f"[INFO] Vocabulary size: {args.vocab_size}") @@ -461,16 +462,16 @@ def main(): print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") print(f"[INFO] Max count per lang: {args.max_count_per_lang}") print(f"[INFO] Available languages: {available_langs}") - + os.makedirs(args.output_dir, exist_ok=True) - + # Step 1: Create balanced corpus print("\n" + "=" * 60) print("STEP 1: Creating balanced IPA corpus") print("=" * 60) - + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - + if os.path.exists(corpus_file): print(f"[INFO] Using existing corpus file: {corpus_file}") with open(corpus_file, "r", encoding="utf-8") as f: @@ -485,24 +486,24 @@ def main(): max_count_per_lang=args.max_count_per_lang, seed=args.seed, ) - + # Step 2: Train tokenizer print("\n" + "=" * 60) print("STEP 2: Training BPE tokenizer") print("=" * 60) - + tokenizer = train_bpe_tokenizer( corpus_file=corpus_file, vocab_size=args.vocab_size, min_frequency=args.min_frequency, output_dir=args.output_dir, ) - + # Test the tokenizer print("\n[INFO] Testing tokenizer with sample IPA strings:") test_strings = [ "həˈloʊ wɜːld", # hello world - "ˈaɪ pʰiː eɪ", # IPA + "ˈaɪ pʰiː eɪ", # IPA "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three ] for test_str in test_strings: @@ -513,7 +514,7 @@ def main(): print(f" 
IDs: {encoded.ids}") print(f" Decoded: '{decoded}'") print() - + print("[INFO] Done!") From 91f71c8347097587208ab4a266826287ffe33230 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 2 Feb 2026 11:12:13 -0800 Subject: [PATCH 30/94] nemotron mamba model (#58) * nemotron mamba model Signed-off-by: Paarth Neekhara * lhotse config update Signed-off-by: Paarth Neekhara * mamba inference working Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 39 + .../conf/magpietts/easy_magpietts_lhotse.yaml | 40 + examples/tts/evalset_config.json | 40 +- nemo/collections/tts/models/easy_magpietts.py | 87 +- nemo/collections/tts/modules/__init__.py | 10 + .../tts/modules/nemotron_h_decoder.py | 1456 +++++++++++++++++ .../tts/test_nemotron_h_decoder.py | 745 +++++++++ 7 files changed, 2381 insertions(+), 36 deletions(-) create mode 100644 nemo/collections/tts/modules/nemotron_h_decoder.py create mode 100644 tests/collections/tts/test_nemotron_h_decoder.py diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index eea075870b07..6166fd68968f 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -13,7 +13,46 @@ train_ds_meta: ??? val_ds_meta: ??? model: + # Decoder backend selection + # Options: "huggingface" (default), "nemotron_h" + decoder_type: "huggingface" + + # HuggingFace backend config (used when decoder_type: "huggingface") transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + + # NemotronH config (used when decoder_type: "nemotron_h") + # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer + nemotron_h_config: + hidden_size: 1536 # Should match embedding_dim + num_hidden_layers: 24 + vocab_size: 131072 + # Attention config + num_attention_heads: 12 + num_key_value_heads: 4 + attention_dropout: 0.0 + attention_bias: false + max_position_embeddings: 8192 + # Mamba config + mamba_num_heads: 64 + mamba_head_dim: 24 + ssm_state_size: 128 + conv_kernel: 4 + n_groups: 8 + chunk_size: 256 + mamba_hidden_act: "silu" + use_conv_bias: true + use_bias: false + # MLP config + intermediate_size: 4096 + mlp_hidden_act: "silu" + mlp_bias: false + # Layer pattern: alternating Mamba and Attention + hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # Normalization + layer_norm_epsilon: 1e-5 + residual_in_fp32: true + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. context_duration_min: 5.0 context_duration_max: 5.0 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 2327820e44a4..5461af8d6ee5 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -11,7 +11,47 @@ quadratic_duration: 20 model: use_lhotse: true + + # Decoder backend selection + # Options: "huggingface" (default), "nemotron_h" + decoder_type: "huggingface" + + # HuggingFace backend config (used when decoder_type: "huggingface") transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + + # NemotronH config (used when decoder_type: "nemotron_h") + # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer + nemotron_h_config: + hidden_size: 1536 # Should match embedding_dim + num_hidden_layers: 24 + vocab_size: 131072 + # Attention config + num_attention_heads: 12 + num_key_value_heads: 4 + attention_dropout: 0.0 + attention_bias: false + max_position_embeddings: 8192 + # Mamba config + mamba_num_heads: 64 + mamba_head_dim: 24 + ssm_state_size: 128 + conv_kernel: 4 + n_groups: 8 + chunk_size: 256 + mamba_hidden_act: "silu" + use_conv_bias: true + use_bias: false + # MLP config + intermediate_size: 4096 + mlp_hidden_act: "silu" + mlp_bias: false + # Layer pattern: alternating Mamba and Attention + hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # Normalization + layer_norm_epsilon: 1e-5 + residual_in_fp32: true + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. context_duration_min: 5.0 context_duration_max: 5.0 diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 029f818ef53b..49822ce9cf25 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -15,43 +15,51 @@ "feature_dir": null }, "riva_multibpe": { - "manifest_path": "/Data/evaluation_manifests/riva_hard_multi_bpe.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/riva_hard_multi_bpe.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_digits": { - "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-digits-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_letters": { - "manifest_path": 
"/Data/evaluation_manifests/hard-letters-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-letters-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_money": { - "manifest_path": "/Data/evaluation_manifests/hard-money-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-money-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_short": { - "manifest_path": "/Data/evaluation_manifests/hard-short-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-short-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "vctk": { - "manifest_path": "/Data/evaluation_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", "audio_dir": "/Data/VCTK-Corpus-0.92", - "feature_dir": "/Data/VCTK-Corpus-0.92" + "feature_dir": "/Data/VCTK-Corpus-0.92", + "tokenizer_names": ["nemotron_nano_30b"] }, "libritts_seen": { - "manifest_path": "/Data/evaluation_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS" + "feature_dir": "/Data/LibriTTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "libritts_test_clean": { - "manifest_path": 
"/Data/evaluation_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS" + "feature_dir": "/Data/LibriTTS", + "tokenizer_names": ["nemotron_nano_30b"] } } diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index f6dd9c7728b3..dabdd0ae6f30 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -301,15 +301,37 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) + # Decoder backend selection - supports HuggingFace models or NemotronH + self.decoder_type = cfg.get('decoder_type', 'huggingface') # backward compatible default + logging.info(f"Using decoder type: {self.decoder_type}") + + if self.decoder_type == 'huggingface': + # Existing HuggingFace path + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + elif self.decoder_type == 'nemotron_h': + # NemotronH hybrid Mamba2/Attention backend + from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM + + # Build config from YAML parameters + nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) + # Ensure hidden_size matches embedding_dim for compatibility + if 'hidden_size' not in nemotron_h_config_dict: + 
nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim + nemotron_config = NemotronHConfig(**nemotron_h_config_dict) + nemotron_model = NemotronHForCausalLM(nemotron_config) + self.decoder = nemotron_model.backbone + self.lm_text_head = nemotron_model.lm_head + logging.info(f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}...") - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - # self.decoder.to(torch.float32) - self.lm_text_head = hf_transformer.lm_head + else: + raise ValueError(f"Unknown decoder_type: {self.decoder_type}. Supported: 'huggingface', 'nemotron_h'") self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) @@ -647,13 +669,23 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - ) + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): + # Only pass cache_position for NemotronH (HF transformers may not accept it) + if self.decoder_type == 'nemotron_h': + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + cache_position=cache_position, + ) + else: + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) # hidden_states = backend_out.last_hidden_state # (B, T_total, H) return backend_out @@ -1999,17 +2031,25 @@ def 
infer_batch( ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + + # Initialize cache_position for tracking sequence position (needed for NemotronH) + cache_position = torch.arange(min_context_len, device=context_embedding.device) + # First forward pass to get the initial hidden state and past key values transformer_out = self.forward( inputs_embeds=first_inference_input, attention_mask=None, use_cache=True, past_key_values=None, # No past key values for the first step + cache_position=cache_position, ) time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state # (B, T_total, E) past_kv = transformer_out.past_key_values + + # Track the current sequence length for cache_position updates + current_cache_seq_len = min_context_len all_predictions = [] end_indices = {} @@ -2034,7 +2074,7 @@ def infer_batch( current_text_positions += 1 if self.phoneme_tokenizer is not None: current_phoneme_positions += 1 - print("current_phoneme_positions", current_phoneme_positions) + # print("current_phoneme_positions", current_phoneme_positions) if idx % 20 == 0: print(f"Decoding timestep {idx}") @@ -2098,15 +2138,15 @@ def infer_batch( device=context_embedding.device, ).long() # (B, phoneme_stacking_factor) use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - print("use_bos_phoneme", use_bos_phoneme) + # print("use_bos_phoneme", use_bos_phoneme) pred_phoneme_tokens = ( use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens ).long() # (B, phoneme_stacking_factor) - print("pred_phoneme_tokens", pred_phoneme_tokens) + # print("pred_phoneme_tokens", pred_phoneme_tokens) gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) - print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) + # print("gt_phoneme_tokens_current", 
gt_phoneme_tokens_current) input_phoneme_tokens_current = ( gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens @@ -2126,7 +2166,7 @@ def infer_batch( phoneme_channel_input_t = ( use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding ) - print("use_phoneme_input", use_phoneme_input) + # print("use_phoneme_input", use_phoneme_input) for item_idx in range(actual_batch_size): if use_phoneme_input[item_idx, 0, 0] > 0: for phoneme_channel_idx in range(self.phoneme_stacking_factor): @@ -2202,14 +2242,21 @@ def infer_batch( if use_cfg: next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + # Update cache_position for current step (needed for NemotronH cached forward) + cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) + transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, use_cache=True, past_key_values=past_kv, + cache_position=cache_position, ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values + + # Increment sequence length for next iteration + current_cache_seq_len += 1 if len(end_indices) == audio_codes_next.size(0): print("All items finished at timestep {}".format(idx)) break diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index c4dffba34215..0c9a8c182b71 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -15,3 +15,13 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules import nemo.collections.tts.modules.moe_modules +from nemo.collections.tts.modules.nemotron_h_decoder import ( + HybridMambaAttentionDynamicCache, + NemotronHConfig, + NemotronHForCausalLM, + NemotronHModel, +) +from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder +from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder 
+from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet +from nemo.collections.tts.modules.waveglow import WaveGlowModule diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py new file mode 100644 index 000000000000..b33c1ecba663 --- /dev/null +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -0,0 +1,1456 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +NemotronH model implementation for use as a decoder backbone in TTS models. +Ported from: https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/blob/main/modeling_nemotron_h.py + +This is a hybrid Mamba2/Attention model that can be configured with different +layer types (Mamba, Attention, MLP, MoE) via the hybrid_override_pattern config. 
+""" + +import math +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from nemo.utils import logging + + +# Try to import optimized kernels, fall back to pure PyTorch if unavailable +try: + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined + MAMBA_SSM_AVAILABLE = True +except ImportError: + selective_state_update = None + mamba_chunk_scan_combined = None + mamba_split_conv1d_scan_combined = None + MAMBA_SSM_AVAILABLE = False + +try: + from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn + RMSNORM_FN_AVAILABLE = True +except ImportError: + rmsnorm_fn = None + RMSNORM_FN_AVAILABLE = False + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update + CAUSAL_CONV1D_AVAILABLE = True +except ImportError: + causal_conv1d_fn = None + causal_conv1d_update = None + CAUSAL_CONV1D_AVAILABLE = False + +try: + from flash_attn import flash_attn_func + FLASH_ATTN_AVAILABLE = True +except ImportError: + flash_attn_func = None + FLASH_ATTN_AVAILABLE = False + + +# Check if fast path is available (all optimized kernels present) +IS_FAST_PATH_AVAILABLE = all([ + MAMBA_SSM_AVAILABLE, + CAUSAL_CONV1D_AVAILABLE, + selective_state_update is not None, + mamba_chunk_scan_combined is not None, + causal_conv1d_fn is not None, +]) + + +def get_activation_fn(activation: str): + """Get activation function by name.""" + if activation == "silu" or activation == "swish": + return F.silu + elif activation == "gelu": + return F.gelu + elif activation == "relu": + return F.relu + else: + raise ValueError(f"Unsupported activation: {activation}") + + +@dataclass +class NemotronHConfig: + """ + Configuration class for NemotronH model. 
+ + This configuration controls the hybrid Mamba2/Attention architecture. + The layer types are specified via hybrid_override_pattern where: + - 'M' = Mamba2 layer + - '*' = Attention layer + - '-' = MLP layer + - 'E' = MoE layer + """ + # Model dimensions + hidden_size: int = 1536 + num_hidden_layers: int = 24 + vocab_size: int = 131072 + + # Attention config + num_attention_heads: int = 12 + num_key_value_heads: int = 4 + head_dim: Optional[int] = None + attention_dropout: float = 0.0 + attention_bias: bool = False + max_position_embeddings: int = 4096 + + # Mamba config + mamba_num_heads: int = 64 + mamba_head_dim: int = 64 + ssm_state_size: int = 128 + conv_kernel: int = 4 + n_groups: int = 8 + chunk_size: int = 256 + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_floor: float = 1e-4 + time_step_limit: Tuple[float, float] = (0.0, float("inf")) + mamba_hidden_act: str = "silu" + use_conv_bias: bool = True + use_bias: bool = False + + # MLP config + intermediate_size: int = 4096 + mlp_hidden_act: str = "silu" + mlp_bias: bool = False + + # MoE config (if using MoE layers) + n_routed_experts: int = 8 + num_experts_per_tok: int = 2 + moe_intermediate_size: int = 1024 + moe_shared_expert_intermediate_size: int = 2048 + n_group: int = 1 + topk_group: int = 1 + routed_scaling_factor: float = 1.0 + norm_topk_prob: bool = True + + # Layer pattern: M=Mamba, *=Attention, -=MLP, E=MoE + # Example: "M*M*M*M*" = alternating Mamba and Attention + hybrid_override_pattern: str = "M*M*M*M*M*M*M*M*M*M*M*M*" + + # Normalization + layer_norm_epsilon: float = 1e-5 + residual_in_fp32: bool = True + + # Initialization + initializer_range: float = 0.02 + rescale_prenorm_residual: bool = True + + # Output + use_cache: bool = True + use_return_dict: bool = True + output_attentions: bool = False + output_hidden_states: bool = False + num_logits_to_keep: int = 1 + + # Attention implementation + _attn_implementation: str = "sdpa" # "eager", "sdpa", or 
"flash_attention_2" + + def __post_init__(self): + # Derive layers_block_type from hybrid_override_pattern + pattern_map = {'M': 'mamba', '*': 'attention', '-': 'mlp', 'E': 'moe'} + self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] + + # Ensure num_hidden_layers matches pattern length + if len(self.layers_block_type) != self.num_hidden_layers: + # Extend or truncate pattern to match num_hidden_layers + if len(self.layers_block_type) < self.num_hidden_layers: + # Repeat pattern + full_pattern = self.hybrid_override_pattern * (self.num_hidden_layers // len(self.hybrid_override_pattern) + 1) + self.hybrid_override_pattern = full_pattern[:self.num_hidden_layers] + self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] + else: + self.layers_block_type = self.layers_block_type[:self.num_hidden_layers] + self.hybrid_override_pattern = self.hybrid_override_pattern[:self.num_hidden_layers] + + # Set head_dim if not specified + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + + +@dataclass +class NemotronHOutput: + """Output class for NemotronH model.""" + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Any] = None # HybridMambaAttentionDynamicCache + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class NemotronHCausalLMOutput: + """Output class for NemotronH causal LM.""" + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Any] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class HybridMambaAttentionDynamicCache: + """ + A dynamic cache that handles both attention cache (with seq_len dimension) + and mamba cache (with constant shape regardless of seq_len). 
+ """ + + def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16, device=None): + self.dtype = dtype + self.has_previous_state = False + self.conv_kernel_size = config.conv_kernel + + intermediate_size = config.mamba_num_heads * config.mamba_head_dim + ssm_state_size = config.ssm_state_size + conv_kernel_size = config.conv_kernel + + self.conv_states = [] + self.ssm_states = [] + self.key_cache = [] + self.value_cache = [] + self.transformer_layers = [] + + for i in range(config.num_hidden_layers): + if config.layers_block_type[i] == "mamba": + self.conv_states.append( + torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype) + ) + self.ssm_states.append( + torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype) + ) + else: + self.conv_states.append(torch.tensor([[]] * batch_size, device=device)) + self.ssm_states.append(torch.tensor([[]] * batch_size, device=device)) + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) 
<= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] if self.key_cache[layer_idx].dim() > 2 else 0 + + def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False): + if cache_init: + self.conv_states[layer_idx] = new_conv_state.to(self.conv_states[layer_idx].device) + else: + self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) + self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device) + return self.conv_states[layer_idx] + + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): + self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device) + return self.ssm_states[layer_idx] + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + def reset(self): + """Reset all cache states to zero.""" + for i in range(len(self.conv_states)): + if self.conv_states[i].numel() > 0: + self.conv_states[i].zero_() + if self.ssm_states[i].numel() > 0: + self.ssm_states[i].zero_() + for i in range(len(self.key_cache)): + if self.key_cache[i].numel() > 0: + self.key_cache[i].zero_() + if self.value_cache[i].numel() > 0: + self.value_cache[i].zero_() + + +class NemotronHRMSNorm(nn.Module): + """RMSNorm implementation for NemotronH.""" + + 
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        # Learnable per-channel scale; no bias (RMSNorm has no mean centering).
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32 for numerical stability, then cast back.
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)


class MambaRMSNormGated(nn.Module):
    """Gated RMSNorm for Mamba layers: RMSNorm followed by SiLU gating."""

    def __init__(self, hidden_size: int, group_size: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        # Group size passed to the Triton kernel for grouped normalization.
        self.group_size = group_size

    def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Only use Triton kernel if available AND tensors are on CUDA
        use_triton = (
            RMSNORM_FN_AVAILABLE
            and rmsnorm_fn is not None
            and hidden_states.is_cuda
        )

        if use_triton:
            # norm_before_gate=False: gate is applied inside, after the norm.
            return rmsnorm_fn(
                x=hidden_states,
                weight=self.weight,
                bias=None,
                z=gate,
                eps=self.variance_epsilon,
                group_size=self.group_size,
                norm_before_gate=False
            )
        else:
            # Fallback: simple RMSNorm + gating (works on CPU and GPU)
            # NOTE(review): this fallback normalizes over the full hidden dim,
            # not per group_size groups like the Triton path — confirm parity.
            input_dtype = hidden_states.dtype
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
            hidden_states = (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
            if gate is not None:
                hidden_states = hidden_states * F.silu(gate)
            return hidden_states


def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
    """Pad tensor with zeros at the end of the seq_len dim (dim=1)."""
    # F.pad pads last dims first; both shapes target dim=1 of a 3-D or 4-D input.
    pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0)
    return F.pad(input_tensor, pad_shape, mode="constant", value=0)


def reshape_into_chunks(input_tensor, pad_size, chunk_size):
    """Pad seq_len to a multiple of chunk_size and fold it into (num_chunks, chunk_size)."""
    input_tensor = pad_tensor_by_size(input_tensor, pad_size)
    if len(input_tensor.shape) == 3:
        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2])
    else:
        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3])


def segment_sum(input_tensor):
    """Compute segment sum for SSM.

    Produces a (..., chunk, chunk) lower-triangular matrix of cumulative sums;
    entries above the diagonal are set to -inf so exp() maps them to 0.
    """
    chunk_size = input_tensor.size(-1)
    input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
    # Strictly-lower-triangular mask: keep contributions from earlier positions only.
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1)
    input_tensor = input_tensor.masked_fill(~mask, 0)
    tensor_segsum = torch.cumsum(input_tensor, dim=-2)
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0)
    tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf)
    return tensor_segsum


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """Zero out hidden states for padding tokens.

    Skipped when batch size is 1 or the mask covers a single step (decode),
    where padding cannot occur.
    """
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
    return hidden_states


class NemotronHMamba2Mixer(nn.Module):
    """
    Mamba2 mixer layer implementation.
    Computes state space model operations for sequence modeling.
+ """ + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_num_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.ssm_state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim + self.layer_idx = layer_idx + self.use_conv_bias = config.use_conv_bias + self.activation = config.mamba_hidden_act + self.act = get_activation_fn(config.mamba_hidden_act) + self.layer_norm_epsilon = config.layer_norm_epsilon + self.n_groups = config.n_groups + self.head_dim = config.mamba_head_dim + self.chunk_size = config.chunk_size + self.time_step_limit = config.time_step_limit + self.time_step_min = config.time_step_min + self.time_step_max = config.time_step_max + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.use_conv_bias, + kernel_size=config.conv_kernel, + groups=self.conv_dim, + padding=config.conv_kernel - 1, + ) + + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=config.use_bias) + + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.A_log._no_weight_decay = True + + self.norm = MambaRMSNormGated( + self.intermediate_size, + eps=self.layer_norm_epsilon, + group_size=self.intermediate_size // self.n_groups + ) + self.D = nn.Parameter(torch.ones(self.num_heads)) + self.D._no_weight_decay = True + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + self.use_bias = config.use_bias + + def forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + 
attention_mask: Optional[torch.Tensor] = None, + ): + # Only use CUDA kernels if available AND tensors are on CUDA + if IS_FAST_PATH_AVAILABLE and hidden_states.is_cuda: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + d_mlp = ( + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads + ) // 2 + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # Cached forward (single token) + _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + A = -torch.exp(self.A_log.float()) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = 
C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, A, B, C, D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + out = self.out_proj(hidden_states)[:, None, ...] + else: + # Full sequence forward + A = -torch.exp(self.A_log.float()) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + else: + _, _, gate, hidden_states_B_C, dt = projected_states.split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = F.pad( + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + 
weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + scan_output = self.norm(scan_output, gate) + out = self.out_proj(scan_output) + + return out + + def torch_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + """Pure PyTorch implementation (slower but works without CUDA kernels).""" + batch_size, seq_len, _ = hidden_states.shape + dtype = hidden_states.dtype + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + d_mlp = ( + projected_states.shape[-1] - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size - self.num_heads + ) // 2 + _, _, gate, hidden_states_B_C, dt = projected_states.split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # Convolution + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + 
cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + hidden_states_B_C = torch.sum(conv_states * self.conv1d.weight.squeeze(1), dim=-1) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = F.pad( + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # SSM + A = -torch.exp(self.A_log.float()) + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # Single step SSM update + cache_device = cache_params.ssm_states[self.layer_idx].device + dt = dt[:, 0, :][:, None, ...] 
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + dt = F.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + + A_expanded = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dA = (torch.exp(dt[..., None] * A_expanded)).to(device=cache_device) + + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + dB = dt[..., None] * B[..., None, :] + + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + cache_params.update_ssm_state( + layer_idx=self.layer_idx, + new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + y = y.reshape(batch_size, -1)[:, None, ...] 
+ else: + # Full sequence SSM (chunked) + dt = F.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + hidden_states = hidden_states * dt[..., None] + A_dt = A.to(hidden_states.dtype) * dt + + hidden_states, A_dt, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C)] + + A_dt = A_dt.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A_dt, dim=-1) + L = torch.exp(segment_sum(A_dt)) + + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] + G = G_intermediate.sum(dim=-1) + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(F.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * 
states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + y = Y_diag + Y_off + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + y = y + D_residual + + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + + scan_output = self.norm(y, gate) + contextualized_states = self.out_proj(scan_output.to(dtype)) + return contextualized_states + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """Repeat key/value heads for multi-query attention.""" + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class NemotronHAttention(nn.Module): + """Multi-headed attention for NemotronH.""" + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = 
nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # Returns (attn_output, None, cache); attention weights are never
        # materialized because SDPA is used (output_attentions is ignored).
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            # Appends along the sequence dim and returns full K/V history.
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)

        # Expand KV heads to match query heads (GQA).
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            # Crop the mask to the current KV length.
            causal_mask = attention_mask[:, :, :, :key_states.shape[-2]]

        # SDPA with a custom mask requires contiguous tensors on CUDA.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # Only let SDPA build its own causal mask when no explicit mask is
        # given and we are prefilling (q_len > 1).
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


class NemotronHMLP(nn.Module):
    """MLP layer for NemotronH (up-project, activation, down-project)."""

    def __init__(self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        # intermediate_size override lets MoE reuse this class with a smaller
        # per-expert width.
        self.intermediate_size = intermediate_size or config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = get_activation_fn(config.mlp_hidden_act)

    def forward(self, x):
        return self.down_proj(self.act_fn(self.up_proj(x)))


class NemotronHTopkRouter(nn.Module):
    """
    Top-k router for Mixture of Experts.

    Routes tokens to the top-k experts based on learned routing weights.
    Supports grouped routing where experts are divided into groups and
    top-k groups are selected first, then top-k experts within those groups.
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter( + torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32) + ) + self.register_buffer( + "e_score_correction_bias", + torch.zeros(self.n_routed_experts, dtype=torch.float32) + ) + + @torch.no_grad() + def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: + """Get top-k expert indices using grouped routing.""" + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + + # Compute group scores by taking top-2 within each group and summing + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + + # Select top-k groups + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + + # Create mask for experts in selected groups + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + + # Zero out scores for experts not in selected groups + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) + + # Select top-k experts from remaining + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] + return topk_indices + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Route tokens to experts. 
+ + Args: + hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) + + Returns: + topk_indices: Indices of selected experts (batch_size * seq_len, top_k) + topk_weights: Weights for selected experts (batch_size * seq_len, top_k) + """ + hidden_states = hidden_states.view(-1, self.config.hidden_size) + + # Compute router logits and convert to probabilities via sigmoid + router_logits = F.linear(hidden_states.float(), self.weight.float()) + scores = router_logits.sigmoid() + + # Get top-k expert indices + topk_indices = self.get_topk_indices(scores) + + # Gather weights for selected experts + topk_weights = scores.gather(1, topk_indices) + + # Optionally normalize weights + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights = topk_weights / denominator + + # Apply routing scaling factor + topk_weights = topk_weights * self.routed_scaling_factor + + return topk_indices, topk_weights + + +class NemotronHMOE(nn.Module): + """ + Mixture of Experts layer for NemotronH. + + Combines multiple expert MLPs with a router that selects which experts + to use for each token. Also includes shared experts that are always used. 
+ """ + + def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + # Create routed experts + self.experts = nn.ModuleList([ + NemotronHMLP( + config, + intermediate_size=config.moe_intermediate_size, + layer_idx=layer_idx + ) + for _ in range(config.n_routed_experts) + ]) + + # Router for selecting experts + self.gate = NemotronHTopkRouter(config) + + # Shared experts (always used) + self.shared_experts = NemotronHMLP( + config=config, + intermediate_size=config.moe_shared_expert_intermediate_size, + layer_idx=layer_idx + ) + + def moe( + self, + hidden_states: torch.Tensor, + topk_indices: torch.Tensor, + topk_weights: torch.Tensor + ) -> torch.Tensor: + """ + Apply mixture of experts to hidden states. + + Args: + hidden_states: Input tensor of shape (batch_size * seq_len, hidden_size) + topk_indices: Expert indices of shape (batch_size * seq_len, top_k) + topk_weights: Expert weights of shape (batch_size * seq_len, top_k) + + Returns: + Output tensor of shape (batch_size * seq_len, hidden_size) + """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + + # Create one-hot mask for expert selection + expert_mask = F.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) # (num_experts, batch*seq, top_k) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + if token_indices.numel() > 0: + # Get weights and inputs for this expert + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + + # Apply expert and weight the output + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + # Accumulate weighted outputs + final_hidden_states.index_add_(0, token_indices, weighted_output) + 
else: + # No-op compute to mark params as used (for distributed training) + expert_dtype = expert.down_proj.weight.dtype + dummy_input = torch.zeros_like(hidden_states[0]).unsqueeze(0).to(expert_dtype) + dummy_out = expert(dummy_input) + final_hidden_states = final_hidden_states + dummy_out * 0 + + return final_hidden_states.to(hidden_states.dtype) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass through MoE layer. + + Args: + hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) + + Returns: + Output tensor of shape (batch_size, seq_len, hidden_size) + """ + residuals = hidden_states + orig_shape = hidden_states.shape + + # Route tokens to experts + topk_indices, topk_weights = self.gate(hidden_states) + + # Flatten for expert processing + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Apply mixture of experts + hidden_states = self.moe(hidden_states, topk_indices, topk_weights) + + # Reshape back to original shape + hidden_states = hidden_states.view(*orig_shape) + + # Add shared expert output + hidden_states = hidden_states + self.shared_experts(residuals) + + return hidden_states + + +class NemotronHBlock(nn.Module): + """A single block in NemotronH - can be Mamba, Attention, MLP, or MoE.""" + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.residual_in_fp32 = config.residual_in_fp32 + self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.block_type = config.layers_block_type[layer_idx] + if self.block_type == "mamba": + self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) + elif self.block_type == "attention": + self.mixer = NemotronHAttention(config, layer_idx=layer_idx) + elif self.block_type == "mlp": + self.mixer = NemotronHMLP(config, layer_idx=layer_idx) + elif self.block_type == "moe": + self.mixer = NemotronHMOE(config, 
layer_idx=layer_idx) + else: + raise ValueError(f"Invalid block type: {self.block_type}") + + def forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs + if hidden_states.is_cuda: + with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)): + return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) + else: + return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) + + def _forward_impl( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + residual = hidden_states + hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + if self.block_type == "mamba": + hidden_states = self.mixer( + hidden_states, cache_params=cache_params, cache_position=cache_position + ) + elif self.block_type == "attention": + hidden_states = self.mixer( + hidden_states, cache_position=cache_position, past_key_value=cache_params + ) + hidden_states = hidden_states[0] + elif self.block_type in ("mlp", "moe"): + hidden_states = self.mixer(hidden_states) + + hidden_states = residual + hidden_states + return hidden_states + + +class NemotronHModel(nn.Module): + """ + NemotronH backbone model. + + This is the main backbone that can be used as a decoder in TTS models. + It exposes the same interface as HuggingFace transformer models. 
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + self._init_weights() + + def _init_weights(self): + """Initialize weights with special handling for Mamba components.""" + for name, module in self.named_modules(): + if isinstance(module, NemotronHMamba2Mixer): + # Mark parameters that should not have weight decay + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + # Special initialization for dt_bias using inverse softplus + # This follows the Mamba2 initialization scheme + dt = torch.exp( + torch.rand(module.num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + module.dt_bias.copy_(inv_dt) + module.dt_bias._no_reinit = True + + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + + # Rescale prenorm residual weights for better training stability + # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) + if self.config.rescale_prenorm_residual: + for name, p in self.named_parameters(): + if "out_proj.weight" in name: + # Special Scaled Initialization for residual projections + # Scale by 1/sqrt(num_hidden_layers) + with 
torch.no_grad(): + p /= math.sqrt(self.config.num_hidden_layers) + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[Tuple, NemotronHOutput]: + # Support both cache_params and past_key_values for compatibility + if past_key_values is not None and cache_params is None: + cache_params = past_key_values + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + hidden_states = inputs_embeds + + # Create cache if use_cache=True but no cache provided + if use_cache and cache_params is None: + cache_params = HybridMambaAttentionDynamicCache( + self.config, + batch_size=hidden_states.shape[0], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + if cache_position is None: + cache_position = 
torch.arange(hidden_states.shape[1], device=hidden_states.device) + + # Create causal mask for attention layers + causal_mask = self._create_causal_mask(attention_mask, inputs_embeds, cache_position) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for layer_idx, layer in enumerate(self.layers): + if layer.block_type == "mamba": + layer_mask = mamba_mask + elif layer.block_type == "attention": + layer_mask = causal_mask + else: + layer_mask = None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + hidden_states = torch.utils.checkpoint.checkpoint( + layer.__call__, hidden_states, cache_params, cache_position, layer_mask + ) + else: + hidden_states = layer( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=layer_mask, + ) + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return NemotronHOutput( + last_hidden_state=hidden_states, + past_key_values=cache_params if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _create_causal_mask(self, attention_mask, input_tensor, cache_position): + """Create causal attention mask.""" + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > 
cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + + if attention_mask is not None: + causal_mask = causal_mask.clone() + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + Update Mamba mask with optimization. + + No need for zeroing states when: + 1. Cached forward (cache_position[0] > 0) + 2. Attending to all inputs (all mask values are 1) + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +class NemotronHForCausalLM(nn.Module): + """ + NemotronH model with a language modeling head. + + This is the full model that matches the AutoModelForCausalLM interface. 
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + self.backbone = NemotronHModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self._init_weights() + + def _init_weights(self): + """Initialize weights.""" + nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range) + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.backbone.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @property + def model(self): + """Alias for backbone, for HuggingFace compatibility.""" + return self.backbone + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[Tuple, NemotronHCausalLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.backbone( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_params=cache_params, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + 
cache_position=cache_position, + ) + + hidden_states = outputs.last_hidden_state if return_dict else outputs[0] + logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() + + loss = None + if labels is not None: + labels = labels.to(logits.device) + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return NemotronHCausalLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + """Prepare inputs for generation.""" + empty_past_kv = past_key_values is None + + # If we have cache: slice input_ids through cache_position to keep only unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids + # Exception 3: with synced GPUs cache_position may go out of bounds + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0]:] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.backbone.embeddings.weight.dtype, device=input_ids.device + ) + + # Create position_ids on the fly for batch generation if not provided + if attention_mask is not None and position_ids is None: + 
position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1]:] + + # If inputs_embeds are passed, only use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} + + model_inputs.update({ + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "cache_position": cache_position, + }) + return model_inputs diff --git a/tests/collections/tts/test_nemotron_h_decoder.py b/tests/collections/tts/test_nemotron_h_decoder.py new file mode 100644 index 000000000000..4b21dc1ae716 --- /dev/null +++ b/tests/collections/tts/test_nemotron_h_decoder.py @@ -0,0 +1,745 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test script for NemotronH decoder module. + +This script tests: +1. NemotronHConfig initialization +2. NemotronHModel forward pass +3. NemotronHForCausalLM forward pass +4. KV caching for inference +5. 
Interface compatibility with EasyMagpieTTSModel requirements
"""

try:
    import pytest
    PYTEST_AVAILABLE = True
except ImportError:
    PYTEST_AVAILABLE = False
    # Create a dummy pytest fixture decorator for standalone execution
    class pytest:
        @staticmethod
        def fixture(func):
            return func

import torch

from nemo.collections.tts.modules.nemotron_h_decoder import (
    HybridMambaAttentionDynamicCache,
    NemotronHConfig,
    NemotronHForCausalLM,
    NemotronHMLP,
    NemotronHMOE,
    NemotronHModel,
    NemotronHTopkRouter,
)


class TestNemotronHConfig:
    """Test NemotronHConfig initialization and defaults."""

    def test_default_config(self):
        """Test default config initialization."""
        config = NemotronHConfig()
        assert config.hidden_size == 1536
        assert config.num_hidden_layers == 24
        assert len(config.layers_block_type) == config.num_hidden_layers

    def test_custom_pattern(self):
        """Test custom hybrid_override_pattern."""
        # 'M' = mamba, '*' = attention in the override pattern string.
        config = NemotronHConfig(
            num_hidden_layers=8,
            hybrid_override_pattern="M*M*M*M*"
        )
        assert config.layers_block_type == ['mamba', 'attention'] * 4

    def test_pattern_extension(self):
        """Test that short patterns are extended to match num_hidden_layers."""
        config = NemotronHConfig(
            num_hidden_layers=8,
            hybrid_override_pattern="M*"
        )
        assert len(config.layers_block_type) == 8


class TestNemotronHModel:
    """Test NemotronHModel backbone."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        return NemotronHModel(small_config)

    def test_model_creation(self, model, small_config):
        """Test model can be created."""
        assert model is not None
        assert len(model.layers) == small_config.num_hidden_layers

    def test_forward_with_input_ids(self, model):
        """Test forward pass with input_ids."""
        batch_size, seq_len = 2, 16
        input_ids = torch.randint(0, 1000, (batch_size, seq_len))

        output = model(input_ids=input_ids)

        assert output.last_hidden_state is not None
        assert output.last_hidden_state.shape == (batch_size, seq_len, 64)

    def test_forward_with_inputs_embeds(self, model):
        """Test forward pass with inputs_embeds (required for TTS)."""
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        output = model(inputs_embeds=inputs_embeds)

        assert output.last_hidden_state is not None
        assert output.last_hidden_state.shape == (batch_size, seq_len, hidden_size)

    def test_get_set_input_embeddings(self, model):
        """Test get/set input embeddings interface."""
        original_embeddings = model.get_input_embeddings()
        assert original_embeddings is not None

        new_embeddings = torch.nn.Embedding(100, 64)
        model.set_input_embeddings(new_embeddings)

        assert model.get_input_embeddings() is new_embeddings


class TestNemotronHForCausalLM:
    """Test NemotronHForCausalLM full model."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        return NemotronHForCausalLM(small_config)

    def test_model_creation(self, model, small_config):
        """Test model can be created."""
        assert model is not None
        assert model.backbone is not None
        assert model.lm_head is not None

    def test_model_alias(self, model):
        """Test that model.model returns backbone (HF compatibility)."""
        assert model.model is model.backbone

    def test_forward_with_inputs_embeds(self, model):
        """Test forward pass with inputs_embeds."""
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        output = model(inputs_embeds=inputs_embeds)

        assert output.logits is not None
        assert output.logits.shape == (batch_size, seq_len, 1000)  # vocab_size

    def test_interface_compatibility(self, model):
        """Test that model satisfies EasyMagpieTTSModel interface requirements."""
        # Test 1: decoder.get_input_embeddings()
        embeddings = model.backbone.get_input_embeddings()
        assert embeddings is not None

        # Test 2: decoder.set_input_embeddings()
        new_emb = torch.nn.Embedding(100, 64)
        model.backbone.set_input_embeddings(new_emb)
        assert model.backbone.get_input_embeddings() is new_emb

        # Reset for next tests
        model.backbone.set_input_embeddings(embeddings)

        # Test 3: decoder(inputs_embeds, attention_mask, use_cache, past_key_values)
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)
        attention_mask = torch.ones(batch_size, seq_len)

        output = model.backbone(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            use_cache=False,
            past_key_values=None,
        )

        # Test 4: Return .last_hidden_state
        assert hasattr(output, 'last_hidden_state')
        assert output.last_hidden_state is not None

        # Test 5: Return .past_key_values (when use_cache=True not tested here as it requires more setup)
        assert hasattr(output, 'past_key_values')


class TestHybridCache:
    """Test HybridMambaAttentionDynamicCache."""

    def test_cache_creation(self):
        """Test cache can be created."""
        config = NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            conv_kernel=4,
            hybrid_override_pattern="M*M*",
        )

        batch_size = 2
        cache = HybridMambaAttentionDynamicCache(config, batch_size, dtype=torch.float32)

        # Cache keeps per-layer state lists for both mamba and attention layers.
        assert len(cache.conv_states) == config.num_hidden_layers
        assert len(cache.ssm_states) == config.num_hidden_layers
        assert len(cache.key_cache) == config.num_hidden_layers
        assert len(cache.value_cache) == config.num_hidden_layers


class TestNemotronHCausality:
    """Test that NemotronH model is causal (future timesteps don't affect previous ones)."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing causality."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        model = NemotronHModel(small_config)
        model.eval()  # Set to eval mode for deterministic behavior
        return model

    def test_causality_with_input_modification(self, model, small_config):
        """
        Test causality by modifying future timesteps and checking that earlier outputs are unchanged.

        The test:
        1. Pass sequence through the model
        2. Modify a future timestep in the input
        3. Verify outputs at earlier timesteps remain exactly the same
        """
        batch_size, seq_len = 2, 16
        hidden_size = small_config.hidden_size

        # Create a base input
        torch.manual_seed(42)
        inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

        # Get output with original input
        with torch.no_grad():
            output_original = model(inputs_embeds=inputs_embeds_original.clone())

        # Test at different positions
        test_positions = [seq_len // 4, seq_len // 2, 3 * seq_len // 4]

        for modify_pos in test_positions:
            # Create modified input where we change timesteps from modify_pos onwards
            inputs_embeds_modified = inputs_embeds_original.clone()
            # Add random noise to all positions from modify_pos onwards
            inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
                batch_size, seq_len - modify_pos, hidden_size
            ) * 10.0  # Large modification to ensure it would affect outputs if not causal

            # Get output with modified input
            with torch.no_grad():
                output_modified = model(inputs_embeds=inputs_embeds_modified)

            # Check that outputs BEFORE modify_pos are unchanged
            outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :]
            outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :]

            # Should be exactly equal (within floating point tolerance)
            assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \
                f"Causality violation: modifying position {modify_pos} affected earlier positions"

            # Verify that outputs AT and AFTER modify_pos are different (sanity check)
            outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :]
            outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :]

            assert not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3), \
                f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position"

    def test_causality_incremental_vs_full(self, model, small_config):
        """
        Test causality by comparing incremental (token-by-token) vs full sequence processing.

        A causal model should produce the same output whether we:
        1. Process the full sequence at once
        2. Process tokens incrementally one at a time
        """
        batch_size, seq_len = 1, 8  # Smaller seq for incremental test
        hidden_size = small_config.hidden_size

        torch.manual_seed(123)
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        # Get output from full sequence
        with torch.no_grad():
            output_full = model(inputs_embeds=inputs_embeds)

        # Get outputs incrementally (one token at a time)
        # For a causal model, output at each position should match
        incremental_outputs = []
        for t in range(1, seq_len + 1):
            with torch.no_grad():
                partial_output = model(inputs_embeds=inputs_embeds[:, :t, :])
                # Take only the last timestep output for comparison
                incremental_outputs.append(partial_output.last_hidden_state[:, -1:, :])

        # Stack incremental outputs
        output_incremental = torch.cat(incremental_outputs, dim=1)

        # Compare: the full sequence output should match the incrementally computed outputs
        assert torch.allclose(output_full.last_hidden_state, output_incremental, atol=1e-4), \
            "Causality violation: incremental processing produces different results than full sequence"

    def test_causality_causal_lm(self, small_config):
        """Test causality for NemotronHForCausalLM."""
        model = NemotronHForCausalLM(small_config)
        model.eval()

        batch_size, seq_len = 2, 12
        hidden_size = small_config.hidden_size

        torch.manual_seed(456)
        inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

        modify_pos = seq_len // 2

        # Get logits with original input
        with torch.no_grad():
            output_original = model(inputs_embeds=inputs_embeds_original.clone())

        # Modify future positions
        inputs_embeds_modified = inputs_embeds_original.clone()
        inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
            batch_size, seq_len - modify_pos, hidden_size
        ) * 10.0

        with torch.no_grad():
            output_modified = model(inputs_embeds=inputs_embeds_modified)

        # Check logits before modify_pos are unchanged
        logits_before_original = output_original.logits[:, :modify_pos, :]
        logits_before_modified = output_modified.logits[:, :modify_pos, :]

        assert torch.allclose(logits_before_original, logits_before_modified, atol=1e-5), \
            "Causality violation in CausalLM: modifying future positions affected earlier logits"

    def test_causality_different_layer_types(self):
        """Test causality with different hybrid patterns (Mamba-only, Attention-only, mixed)."""
        patterns = [
            "MMMM",  # Mamba only
            "****",  # Attention only
            "M*M*",  # Alternating
            "MM**",  # Mixed blocks
        ]

        for pattern in patterns:
            config = NemotronHConfig(
                hidden_size=64,
                num_hidden_layers=4,
                vocab_size=1000,
                num_attention_heads=4,
                num_key_value_heads=2,
                mamba_num_heads=8,
                mamba_head_dim=8,
                ssm_state_size=16,
                n_groups=2,
                intermediate_size=128,
                hybrid_override_pattern=pattern,
            )

            model = NemotronHModel(config)
            model.eval()

            batch_size, seq_len = 2, 8
            hidden_size = config.hidden_size

            torch.manual_seed(789)
            inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

            modify_pos = 4

            with torch.no_grad():
                output_original = model(inputs_embeds=inputs_embeds_original.clone())

            inputs_embeds_modified = inputs_embeds_original.clone()
            inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
                batch_size, seq_len - modify_pos, hidden_size
            ) * 10.0

            with torch.no_grad():
                output_modified = model(inputs_embeds=inputs_embeds_modified)

            outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :]
            outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :]

            assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \
                f"Causality violation for pattern '{pattern}': modifying future positions affected 
earlier outputs" + + +class TestMoELayer: + """Test Mixture of Experts layer.""" + + @pytest.fixture + def moe_config(self): + """Create a config for MoE testing.""" + return NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + # MoE config + n_routed_experts=4, + num_experts_per_tok=2, + moe_intermediate_size=64, + moe_shared_expert_intermediate_size=128, + n_group=1, + topk_group=1, + routed_scaling_factor=1.0, + norm_topk_prob=True, + hybrid_override_pattern="M*ME", # Includes MoE layer + ) + + def test_topk_router_creation(self, moe_config): + """Test NemotronHTopkRouter creation.""" + router = NemotronHTopkRouter(moe_config) + assert router.weight.shape == (moe_config.n_routed_experts, moe_config.hidden_size) + assert router.top_k == moe_config.num_experts_per_tok + + def test_topk_router_forward(self, moe_config): + """Test NemotronHTopkRouter forward pass.""" + router = NemotronHTopkRouter(moe_config) + batch_size, seq_len = 2, 8 + hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + topk_indices, topk_weights = router(hidden_states) + + # Check shapes + assert topk_indices.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) + assert topk_weights.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) + + # Check indices are valid + assert topk_indices.min() >= 0 + assert topk_indices.max() < moe_config.n_routed_experts + + def test_moe_layer_creation(self, moe_config): + """Test NemotronHMOE creation.""" + moe = NemotronHMOE(moe_config, layer_idx=0) + + assert len(moe.experts) == moe_config.n_routed_experts + assert moe.gate is not None + assert moe.shared_experts is not None + + def test_moe_layer_forward(self, moe_config): + """Test NemotronHMOE forward pass.""" + moe = NemotronHMOE(moe_config, layer_idx=0) + batch_size, seq_len = 2, 8 + 
hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + output = moe(hidden_states) + + assert output.shape == hidden_states.shape + + def test_model_with_moe_pattern(self, moe_config): + """Test full model with MoE layer.""" + model = NemotronHModel(moe_config) + + # Check that MoE layer was created + assert model.layers[3].block_type == "moe" + + # Test forward pass + batch_size, seq_len = 2, 8 + inputs_embeds = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + output = model(inputs_embeds=inputs_embeds) + + assert output.last_hidden_state is not None + assert output.last_hidden_state.shape == (batch_size, seq_len, moe_config.hidden_size) + + +if __name__ == "__main__": + """Run basic tests without pytest.""" + print("Testing NemotronH Decoder Module...") + + # Test 1: Config + print("\n1. Testing NemotronHConfig...") + config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern="M*M*", + ) + print(f" Config created: {config.num_hidden_layers} layers, pattern={config.hybrid_override_pattern}") + print(f" Layer types: {config.layers_block_type}") + + # Test 2: Model creation + print("\n2. Testing NemotronHModel creation...") + model = NemotronHModel(config) + print(f" Model created with {len(model.layers)} layers") + + # Test 3: Forward pass with inputs_embeds + print("\n3. Testing forward pass with inputs_embeds...") + batch_size, seq_len, hidden_size = 2, 16, 64 + inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) + output = model(inputs_embeds=inputs_embeds) + print(f" Input shape: {inputs_embeds.shape}") + print(f" Output shape: {output.last_hidden_state.shape}") + + # Test 4: Full model + print("\n4. 
Testing NemotronHForCausalLM...") + full_model = NemotronHForCausalLM(config) + output = full_model(inputs_embeds=inputs_embeds) + print(f" Logits shape: {output.logits.shape}") + + # Test 5: Interface compatibility + print("\n5. Testing interface compatibility for EasyMagpieTTSModel...") + decoder = full_model.backbone + + # get_input_embeddings + emb = decoder.get_input_embeddings() + print(f" get_input_embeddings(): {type(emb).__name__}") + + # set_input_embeddings + new_emb = torch.nn.Embedding(100, 64) + decoder.set_input_embeddings(new_emb) + print(f" set_input_embeddings(): OK") + decoder.set_input_embeddings(emb) # Reset + + # forward with expected args + output = decoder( + inputs_embeds=inputs_embeds, + attention_mask=torch.ones(batch_size, seq_len), + use_cache=False, + past_key_values=None, + ) + print(f" forward(inputs_embeds, attention_mask, use_cache, past_key_values): OK") + print(f" .last_hidden_state: {output.last_hidden_state.shape}") + print(f" .past_key_values: {output.past_key_values}") + + # Test 6: MoE layer + print("\n6. 
Testing MoE (Mixture of Experts) layer...") + moe_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + # MoE config + n_routed_experts=4, + num_experts_per_tok=2, + moe_intermediate_size=64, + moe_shared_expert_intermediate_size=128, + n_group=1, + topk_group=1, + routed_scaling_factor=1.0, + norm_topk_prob=True, + hybrid_override_pattern="M*ME", # Includes MoE layer + ) + print(f" Config: pattern={moe_config.hybrid_override_pattern}, block_types={moe_config.layers_block_type}") + + # Test router + router = NemotronHTopkRouter(moe_config) + test_input = torch.randn(2, 8, 64) + topk_indices, topk_weights = router(test_input) + print(f" Router: topk_indices shape={topk_indices.shape}, topk_weights shape={topk_weights.shape}") + + # Test MoE layer + moe = NemotronHMOE(moe_config, layer_idx=0) + moe_output = moe(test_input) + print(f" MoE layer: input={test_input.shape}, output={moe_output.shape}") + + # Test full model with MoE + moe_model = NemotronHModel(moe_config) + moe_model_output = moe_model(inputs_embeds=test_input) + print(f" Full model with MoE: output={moe_model_output.last_hidden_state.shape}") + + # Test 7: Causality test + print("\n7. 
Testing model causality (future timesteps don't affect previous ones)...") + + # Create model for causality test + causality_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern="M*M*", + ) + causality_model = NemotronHModel(causality_config) + causality_model.eval() + + batch_size, seq_len = 2, 16 + hidden_size = 64 + + # Create base input + torch.manual_seed(42) + inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) + + # Get output with original input + with torch.no_grad(): + output_original = causality_model(inputs_embeds=inputs_embeds_original.clone()) + + # Test at different positions + test_positions = [4, 8, 12] + causality_passed = True + + for modify_pos in test_positions: + # Create modified input where we change timesteps from modify_pos onwards + inputs_embeds_modified = inputs_embeds_original.clone() + inputs_embeds_modified[:, modify_pos:, :] += torch.randn( + batch_size, seq_len - modify_pos, hidden_size + ) * 10.0 + + # Get output with modified input + with torch.no_grad(): + output_modified = causality_model(inputs_embeds=inputs_embeds_modified) + + # Check that outputs BEFORE modify_pos are unchanged + outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] + outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] + + if torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5): + print(f" Position {modify_pos}: PASS (earlier outputs unchanged)") + else: + print(f" Position {modify_pos}: FAIL (causality violation!)") + causality_passed = False + + # Verify outputs at/after modify_pos are different (sanity check) + outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] + outputs_after_modified = output_modified.last_hidden_state[:, 
modify_pos:, :] + + if not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3): + print(f" Position {modify_pos}: Sanity check PASS (later outputs changed)") + else: + print(f" Position {modify_pos}: Sanity check FAIL (later outputs should change)") + causality_passed = False + + # Test with different layer patterns + print("\n Testing causality with different layer patterns...") + patterns = ["MMMM", "****", "M*M*", "MM**"] + for pattern in patterns: + pattern_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern=pattern, + ) + pattern_model = NemotronHModel(pattern_config) + pattern_model.eval() + + torch.manual_seed(789) + test_input = torch.randn(2, 8, 64) + modify_pos = 4 + + with torch.no_grad(): + out_orig = pattern_model(inputs_embeds=test_input.clone()) + + test_input_mod = test_input.clone() + test_input_mod[:, modify_pos:, :] += torch.randn(2, 4, 64) * 10.0 + + with torch.no_grad(): + out_mod = pattern_model(inputs_embeds=test_input_mod) + + if torch.allclose(out_orig.last_hidden_state[:, :modify_pos, :], + out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5): + print(f" Pattern '{pattern}': PASS") + else: + print(f" Pattern '{pattern}': FAIL (causality violation!)") + causality_passed = False + + if causality_passed: + print(" All causality tests PASSED!") + else: + print(" WARNING: Some causality tests FAILED!") + + print("\n" + "="*50) + print("All tests passed!") + print("="*50) From 3c055497da83ab3fd4be151dbba404e530c0ceff Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 2 Feb 2026 19:13:01 +0000 Subject: [PATCH 31/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 14 +- .../tts/modules/nemotron_h_decoder.py | 568 +++++++++--------- 
.../tts/test_nemotron_h_decoder.py | 309 +++++----- 3 files changed, 450 insertions(+), 441 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index dabdd0ae6f30..3f107736604a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -328,7 +328,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): nemotron_model = NemotronHForCausalLM(nemotron_config) self.decoder = nemotron_model.backbone self.lm_text_head = nemotron_model.lm_head - logging.info(f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}...") + logging.info( + f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." + ) else: raise ValueError(f"Unknown decoder_type: {self.decoder_type}. Supported: 'huggingface', 'nemotron_h'") @@ -2031,10 +2033,10 @@ def infer_batch( ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) - + # Initialize cache_position for tracking sequence position (needed for NemotronH) cache_position = torch.arange(min_context_len, device=context_embedding.device) - + # First forward pass to get the initial hidden state and past key values transformer_out = self.forward( inputs_embeds=first_inference_input, @@ -2047,7 +2049,7 @@ def infer_batch( time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state # (B, T_total, E) past_kv = transformer_out.past_key_values - + # Track the current sequence length for cache_position updates current_cache_seq_len = min_context_len @@ -2244,7 +2246,7 @@ def infer_batch( # Update cache_position for current step (needed for NemotronH cached forward) cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) - + transformer_out = self.forward( 
inputs_embeds=next_input, attention_mask=None, @@ -2254,7 +2256,7 @@ def infer_batch( ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - + # Increment sequence length for next iteration current_cache_seq_len += 1 if len(end_indices) == audio_codes_next.size(0): diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index b33c1ecba663..f89e0a8fd326 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -37,6 +37,7 @@ try: from mamba_ssm.ops.triton.selective_state_update import selective_state_update from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined + MAMBA_SSM_AVAILABLE = True except ImportError: selective_state_update = None @@ -46,6 +47,7 @@ try: from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn + RMSNORM_FN_AVAILABLE = True except ImportError: rmsnorm_fn = None @@ -53,6 +55,7 @@ try: from causal_conv1d import causal_conv1d_fn, causal_conv1d_update + CAUSAL_CONV1D_AVAILABLE = True except ImportError: causal_conv1d_fn = None @@ -61,6 +64,7 @@ try: from flash_attn import flash_attn_func + FLASH_ATTN_AVAILABLE = True except ImportError: flash_attn_func = None @@ -68,13 +72,15 @@ # Check if fast path is available (all optimized kernels present) -IS_FAST_PATH_AVAILABLE = all([ - MAMBA_SSM_AVAILABLE, - CAUSAL_CONV1D_AVAILABLE, - selective_state_update is not None, - mamba_chunk_scan_combined is not None, - causal_conv1d_fn is not None, -]) +IS_FAST_PATH_AVAILABLE = all( + [ + MAMBA_SSM_AVAILABLE, + CAUSAL_CONV1D_AVAILABLE, + selective_state_update is not None, + mamba_chunk_scan_combined is not None, + causal_conv1d_fn is not None, + ] +) def get_activation_fn(activation: str): @@ -93,19 +99,20 @@ def get_activation_fn(activation: str): class NemotronHConfig: """ Configuration class for NemotronH model. 
- + This configuration controls the hybrid Mamba2/Attention architecture. The layer types are specified via hybrid_override_pattern where: - 'M' = Mamba2 layer - - '*' = Attention layer + - '*' = Attention layer - '-' = MLP layer - 'E' = MoE layer """ + # Model dimensions hidden_size: int = 1536 num_hidden_layers: int = 24 vocab_size: int = 131072 - + # Attention config num_attention_heads: int = 12 num_key_value_heads: int = 4 @@ -113,7 +120,7 @@ class NemotronHConfig: attention_dropout: float = 0.0 attention_bias: bool = False max_position_embeddings: int = 4096 - + # Mamba config mamba_num_heads: int = 64 mamba_head_dim: int = 64 @@ -128,12 +135,12 @@ class NemotronHConfig: mamba_hidden_act: str = "silu" use_conv_bias: bool = True use_bias: bool = False - + # MLP config intermediate_size: int = 4096 mlp_hidden_act: str = "silu" mlp_bias: bool = False - + # MoE config (if using MoE layers) n_routed_experts: int = 8 num_experts_per_tok: int = 2 @@ -143,46 +150,48 @@ class NemotronHConfig: topk_group: int = 1 routed_scaling_factor: float = 1.0 norm_topk_prob: bool = True - + # Layer pattern: M=Mamba, *=Attention, -=MLP, E=MoE # Example: "M*M*M*M*" = alternating Mamba and Attention hybrid_override_pattern: str = "M*M*M*M*M*M*M*M*M*M*M*M*" - + # Normalization layer_norm_epsilon: float = 1e-5 residual_in_fp32: bool = True - + # Initialization initializer_range: float = 0.02 rescale_prenorm_residual: bool = True - + # Output use_cache: bool = True use_return_dict: bool = True output_attentions: bool = False output_hidden_states: bool = False num_logits_to_keep: int = 1 - + # Attention implementation _attn_implementation: str = "sdpa" # "eager", "sdpa", or "flash_attention_2" - + def __post_init__(self): # Derive layers_block_type from hybrid_override_pattern pattern_map = {'M': 'mamba', '*': 'attention', '-': 'mlp', 'E': 'moe'} self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] - + # Ensure num_hidden_layers matches pattern 
length if len(self.layers_block_type) != self.num_hidden_layers: # Extend or truncate pattern to match num_hidden_layers if len(self.layers_block_type) < self.num_hidden_layers: # Repeat pattern - full_pattern = self.hybrid_override_pattern * (self.num_hidden_layers // len(self.hybrid_override_pattern) + 1) - self.hybrid_override_pattern = full_pattern[:self.num_hidden_layers] + full_pattern = self.hybrid_override_pattern * ( + self.num_hidden_layers // len(self.hybrid_override_pattern) + 1 + ) + self.hybrid_override_pattern = full_pattern[: self.num_hidden_layers] self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] else: - self.layers_block_type = self.layers_block_type[:self.num_hidden_layers] - self.hybrid_override_pattern = self.hybrid_override_pattern[:self.num_hidden_layers] - + self.layers_block_type = self.layers_block_type[: self.num_hidden_layers] + self.hybrid_override_pattern = self.hybrid_override_pattern[: self.num_hidden_layers] + # Set head_dim if not specified if self.head_dim is None: self.head_dim = self.hidden_size // self.num_attention_heads @@ -191,6 +200,7 @@ def __post_init__(self): @dataclass class NemotronHOutput: """Output class for NemotronH model.""" + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Any] = None # HybridMambaAttentionDynamicCache hidden_states: Optional[Tuple[torch.FloatTensor]] = None @@ -200,6 +210,7 @@ class NemotronHOutput: @dataclass class NemotronHCausalLMOutput: """Output class for NemotronH causal LM.""" + loss: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Any] = None @@ -209,25 +220,25 @@ class NemotronHCausalLMOutput: class HybridMambaAttentionDynamicCache: """ - A dynamic cache that handles both attention cache (with seq_len dimension) + A dynamic cache that handles both attention cache (with seq_len dimension) and mamba cache (with constant shape regardless of seq_len). 
""" - + def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16, device=None): self.dtype = dtype self.has_previous_state = False self.conv_kernel_size = config.conv_kernel - + intermediate_size = config.mamba_num_heads * config.mamba_head_dim ssm_state_size = config.ssm_state_size conv_kernel_size = config.conv_kernel - + self.conv_states = [] self.ssm_states = [] self.key_cache = [] self.value_cache = [] self.transformer_layers = [] - + for i in range(config.num_hidden_layers): if config.layers_block_type[i] == "mamba": self.conv_states.append( @@ -240,10 +251,10 @@ def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16 self.conv_states.append(torch.tensor([[]] * batch_size, device=device)) self.ssm_states.append(torch.tensor([[]] * batch_size, device=device)) self.transformer_layers.append(i) - + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] - + def update( self, key_states: torch.Tensor, @@ -258,13 +269,13 @@ def update( self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) return self.key_cache[layer_idx], self.value_cache[layer_idx] - + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx if len(self.key_cache) <= layer_idx: return 0 return self.key_cache[layer_idx].shape[-2] if self.key_cache[layer_idx].dim() > 2 else 0 - + def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False): if cache_init: self.conv_states[layer_idx] = new_conv_state.to(self.conv_states[layer_idx].device) @@ -272,11 +283,11 @@ def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, 
cache_ self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device) return self.conv_states[layer_idx] - + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device) return self.ssm_states[layer_idx] - + def reorder_cache(self, beam_idx: torch.LongTensor): """Reorders the cache for beam search, given the selected beam indices.""" for layer_idx in range(len(self.key_cache)): @@ -284,12 +295,12 @@ def reorder_cache(self, beam_idx: torch.LongTensor): self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) device = self.value_cache[layer_idx].device self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) - + device = self.conv_states[layer_idx].device self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) device = self.ssm_states[layer_idx].device self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) - + def reset(self): """Reset all cache states to zero.""" for i in range(len(self.conv_states)): @@ -306,12 +317,12 @@ def reset(self): class NemotronHRMSNorm(nn.Module): """RMSNorm implementation for NemotronH.""" - + def __init__(self, hidden_size: int, eps: float = 1e-6): super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) @@ -322,21 +333,17 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MambaRMSNormGated(nn.Module): """Gated RMSNorm for Mamba layers.""" - + def __init__(self, hidden_size: int, group_size: int, eps: float = 1e-5): super().__init__() self.weight = 
nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps self.group_size = group_size - + def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = None) -> torch.Tensor: # Only use Triton kernel if available AND tensors are on CUDA - use_triton = ( - RMSNORM_FN_AVAILABLE - and rmsnorm_fn is not None - and hidden_states.is_cuda - ) - + use_triton = RMSNORM_FN_AVAILABLE and rmsnorm_fn is not None and hidden_states.is_cuda + if use_triton: return rmsnorm_fn( x=hidden_states, @@ -345,7 +352,7 @@ def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = No z=gate, eps=self.variance_epsilon, group_size=self.group_size, - norm_before_gate=False + norm_before_gate=False, ) else: # Fallback: simple RMSNorm + gating (works on CPU and GPU) @@ -371,7 +378,9 @@ def reshape_into_chunks(input_tensor, pad_size, chunk_size): if len(input_tensor.shape) == 3: return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) else: - return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3]) + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) def segment_sum(input_tensor): @@ -399,7 +408,7 @@ class NemotronHMamba2Mixer(nn.Module): Mamba2 mixer layer implementation. Computes state space model operations for sequence modeling. 
""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.num_heads = config.mamba_num_heads @@ -418,7 +427,7 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.time_step_limit = config.time_step_limit self.time_step_min = config.time_step_min self.time_step_max = config.time_step_max - + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size self.conv1d = nn.Conv1d( in_channels=self.conv_dim, @@ -428,27 +437,25 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): groups=self.conv_dim, padding=config.conv_kernel - 1, ) - + projection_size = self.intermediate_size + self.conv_dim + self.num_heads self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=config.use_bias) - + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) - + A = torch.arange(1, self.num_heads + 1) self.A_log = nn.Parameter(torch.log(A)) self.A_log._no_weight_decay = True - + self.norm = MambaRMSNormGated( - self.intermediate_size, - eps=self.layer_norm_epsilon, - group_size=self.intermediate_size // self.n_groups + self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups ) self.D = nn.Parameter(torch.ones(self.num_heads)) self.D._no_weight_decay = True - + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) self.use_bias = config.use_bias - + def forward( self, hidden_states: torch.Tensor, @@ -460,7 +467,7 @@ def forward( if IS_FAST_PATH_AVAILABLE and hidden_states.is_cuda: return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) - + def cuda_kernels_forward( self, hidden_states: torch.Tensor, @@ -470,7 +477,7 @@ def cuda_kernels_forward( ): hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) projected_states = self.in_proj(hidden_states) - + batch_size, seq_len, _ = 
hidden_states.shape groups_time_state_size = self.n_groups * self.ssm_state_size d_mlp = ( @@ -479,13 +486,13 @@ def cuda_kernels_forward( - 2 * self.n_groups * self.ssm_state_size - self.num_heads ) // 2 - + if cache_params is not None and cache_position is not None and cache_position[0] > 0: # Cached forward (single token) _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + hidden_states_B_C = causal_conv1d_update( hidden_states_B_C, cache_params.conv_states[self.layer_idx], @@ -493,13 +500,13 @@ def cuda_kernels_forward( self.conv1d.bias, self.activation, ) - + hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - + A = -torch.exp(self.A_log.float()) A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) dt = dt[:, :, None].expand(-1, -1, self.head_dim) @@ -508,11 +515,15 @@ def cuda_kernels_forward( B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) - + hidden_states = selective_state_update( cache_params.ssm_states[self.layer_idx], hidden_states_reshaped, - dt, A, B, C, D, + dt, + A, + B, + C, + D, z=None, dt_bias=dt_bias, dt_softplus=True, @@ -524,7 +535,7 @@ def cuda_kernels_forward( # Full sequence forward A = -torch.exp(self.A_log.float()) dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} - + if self.training and cache_params is None: out = mamba_split_conv1d_scan_combined( projected_states, @@ -550,17 +561,21 @@ def cuda_kernels_forward( _, _, gate, hidden_states_B_C, dt = projected_states.split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + if cache_params is 
not None: hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) conv_states = F.pad( hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), ) - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) - + cache_params.update_conv_state( + layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True + ) + if self.activation not in ["silu", "swish"]: - hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) else: hidden_states_B_C = causal_conv1d_fn( x=hidden_states_B_C.transpose(1, 2), @@ -568,17 +583,18 @@ def cuda_kernels_forward( bias=self.conv1d.bias, activation=self.activation, ).transpose(1, 2) - + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - + scan_output, ssm_state = mamba_chunk_scan_combined( hidden_states.view(batch_size, seq_len, -1, self.head_dim), - dt, A, + dt, + A, B.view(batch_size, seq_len, self.n_groups, -1), C.view(batch_size, seq_len, self.n_groups, -1), chunk_size=self.chunk_size, @@ -590,16 +606,16 @@ def cuda_kernels_forward( dt_softplus=True, **dt_limit_kwargs, ) - + if ssm_state is not None and cache_params is not None: cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - + scan_output = scan_output.view(batch_size, seq_len, -1) scan_output = self.norm(scan_output, gate) out = self.out_proj(scan_output) - + return out - + def torch_forward( self, hidden_states: torch.Tensor, @@ -610,21 +626,25 @@ def torch_forward( """Pure PyTorch implementation (slower but works without CUDA kernels).""" batch_size, seq_len, _ = hidden_states.shape dtype = 
hidden_states.dtype - + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) projected_states = self.in_proj(hidden_states) - + d_mlp = ( - projected_states.shape[-1] - 2 * self.intermediate_size - - 2 * self.n_groups * self.ssm_state_size - self.num_heads + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads ) // 2 _, _, gate, hidden_states_B_C, dt = projected_states.split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + # Convolution if cache_params is not None and cache_position is not None and cache_position[0] > 0: - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + cache_params.update_conv_state( + layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False + ) conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) hidden_states_B_C = torch.sum(conv_states * self.conv1d.weight.squeeze(1), dim=-1) if self.use_conv_bias: @@ -634,22 +654,22 @@ def torch_forward( if cache_params is not None: hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) conv_states = F.pad( - hidden_states_B_C_transposed, - (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), ) cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) - + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], - dim=-1 + dim=-1, ) - + # SSM A = -torch.exp(self.A_log.float()) - + if cache_params is 
not None and cache_position is not None and cache_position[0] > 0: # Single step SSM update cache_device = cache_params.ssm_states[self.layer_idx].device @@ -658,33 +678,34 @@ def torch_forward( dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) dt = F.softplus(dt + dt_bias.to(dt.dtype)) dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) - - A_expanded = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + + A_expanded = ( + A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + ) dA = (torch.exp(dt[..., None] * A_expanded)).to(device=cache_device) - + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() B = B.reshape(batch_size, -1, B.shape[-1]) dB = dt[..., None] * B[..., None, :] - + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) dBx = (dB * hidden_states[..., None]).to(device=cache_device) - + cache_params.update_ssm_state( - layer_idx=self.layer_idx, - new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx + layer_idx=self.layer_idx, new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx ) - + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() C = C.reshape(batch_size, -1, C.shape[-1]) - + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) y = torch.bmm(ssm_states_reshaped, C_reshaped) y = y.view(batch_size, self.num_heads, self.head_dim) - + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) y = (y + hidden_states * D).to(y.dtype) y = 
y.reshape(batch_size, -1)[:, None, ...] @@ -697,56 +718,58 @@ def torch_forward( C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) - + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) - + hidden_states = hidden_states * dt[..., None] A_dt = A.to(hidden_states.dtype) * dt - - hidden_states, A_dt, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C)] - + + hidden_states, A_dt, B, C = [ + reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C) + ] + A_dt = A_dt.permute(0, 3, 1, 2) A_cumsum = torch.cumsum(A_dt, dim=-1) L = torch.exp(segment_sum(A_dt)) - + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] G = G_intermediate.sum(dim=-1) M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] M = M_intermediate.sum(dim=-1) Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) - + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) - + if cache_params is not None and cache_position is not None and cache_position[0] > 0: previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) else: previous_states = torch.zeros_like(states[:, :1]) - + states = torch.cat([previous_states, states], dim=1) decay_chunk = torch.exp(segment_sum(F.pad(A_cumsum[:, :, :, -1], (1, 0)))) decay_chunk = decay_chunk.transpose(1, 3) new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) states, ssm_state = new_states[:, :-1], new_states[:, -1] - + state_decay_out = torch.exp(A_cumsum) - 
C_times_states = (C[..., None, :] * states[:, :, None, ...]) + C_times_states = C[..., None, :] * states[:, :, None, ...] state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) - Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) - + Y_off = C_times_states.sum(-1) * state_decay_out_permuted[..., None] + y = Y_diag + Y_off y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) y = y + D_residual - + if pad_size > 0: y = y[:, :seq_len, :, :] y = y.reshape(batch_size, seq_len, -1) - + if ssm_state is not None and cache_params is not None: cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - + scan_output = self.norm(y, gate) contextualized_states = self.out_proj(scan_output.to(dtype)) return contextualized_states @@ -763,7 +786,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class NemotronHAttention(nn.Module): """Multi-headed attention for NemotronH.""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.config = config @@ -776,12 +799,12 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.is_causal = True - + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias) - + def forward( self, hidden_states: torch.Tensor, @@ -793,32 +816,32 @@ def forward( cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - + 
query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - + if past_key_value is not None: key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) - + key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - + causal_mask = attention_mask if attention_mask is not None: - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] - + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + if query_states.device.type == "cuda" and attention_mask is not None: query_states = query_states.contiguous() key_states = key_states.contiguous() value_states = value_states.contiguous() - + is_causal = True if causal_mask is None and q_len > 1 else False - + attn_output = F.scaled_dot_product_attention( query_states, key_states, @@ -827,18 +850,20 @@ def forward( dropout_p=self.attention_dropout if self.training else 0.0, is_causal=is_causal, ) - + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim) attn_output = self.o_proj(attn_output) - + return attn_output, None, past_key_value class NemotronHMLP(nn.Module): """MLP layer for NemotronH.""" - - def __init__(self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None): + + def __init__( + self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None + ): super().__init__() self.config = config self.layer_idx = layer_idx @@ -847,7 +872,7 @@ def __init__(self, config: NemotronHConfig, 
intermediate_size: Optional[int] = N self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) self.act_fn = get_activation_fn(config.mlp_hidden_act) - + def forward(self, x): return self.down_proj(self.act_fn(self.up_proj(x))) @@ -855,12 +880,12 @@ def forward(self, x): class NemotronHTopkRouter(nn.Module): """ Top-k router for Mixture of Experts. - + Routes tokens to the top-k experts based on learned routing weights. Supports grouped routing where experts are divided into groups and top-k groups are selected first, then top-k experts within those groups. """ - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config @@ -870,150 +895,136 @@ def __init__(self, config: NemotronHConfig): self.n_group = config.n_group self.topk_group = config.topk_group self.norm_topk_prob = config.norm_topk_prob - - self.weight = nn.Parameter( - torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32) - ) - self.register_buffer( - "e_score_correction_bias", - torch.zeros(self.n_routed_experts, dtype=torch.float32) - ) - + + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32)) + self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32)) + @torch.no_grad() def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: """Get top-k expert indices using grouped routing.""" scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) - + # Compute group scores by taking top-2 within each group and summing group_scores = ( scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) .topk(2, dim=-1)[0] .sum(dim=-1) ) - + # Select top-k groups group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] group_mask = 
torch.zeros_like(group_scores) group_mask.scatter_(1, group_idx, 1) - + # Create mask for experts in selected groups score_mask = ( group_mask.unsqueeze(-1) .expand(-1, self.n_group, self.n_routed_experts // self.n_group) .reshape(-1, self.n_routed_experts) ) - + # Zero out scores for experts not in selected groups scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) - + # Select top-k experts from remaining topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] return topk_indices - + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Route tokens to experts. - + Args: hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) - + Returns: topk_indices: Indices of selected experts (batch_size * seq_len, top_k) topk_weights: Weights for selected experts (batch_size * seq_len, top_k) """ hidden_states = hidden_states.view(-1, self.config.hidden_size) - + # Compute router logits and convert to probabilities via sigmoid router_logits = F.linear(hidden_states.float(), self.weight.float()) scores = router_logits.sigmoid() - + # Get top-k expert indices topk_indices = self.get_topk_indices(scores) - + # Gather weights for selected experts topk_weights = scores.gather(1, topk_indices) - + # Optionally normalize weights if self.norm_topk_prob: denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 topk_weights = topk_weights / denominator - + # Apply routing scaling factor topk_weights = topk_weights * self.routed_scaling_factor - + return topk_indices, topk_weights class NemotronHMOE(nn.Module): """ Mixture of Experts layer for NemotronH. - + Combines multiple expert MLPs with a router that selects which experts to use for each token. Also includes shared experts that are always used. 
""" - + def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config self.layer_idx = layer_idx - + # Create routed experts - self.experts = nn.ModuleList([ - NemotronHMLP( - config, - intermediate_size=config.moe_intermediate_size, - layer_idx=layer_idx - ) - for _ in range(config.n_routed_experts) - ]) - + self.experts = nn.ModuleList( + [ + NemotronHMLP(config, intermediate_size=config.moe_intermediate_size, layer_idx=layer_idx) + for _ in range(config.n_routed_experts) + ] + ) + # Router for selecting experts self.gate = NemotronHTopkRouter(config) - + # Shared experts (always used) self.shared_experts = NemotronHMLP( - config=config, - intermediate_size=config.moe_shared_expert_intermediate_size, - layer_idx=layer_idx + config=config, intermediate_size=config.moe_shared_expert_intermediate_size, layer_idx=layer_idx ) - - def moe( - self, - hidden_states: torch.Tensor, - topk_indices: torch.Tensor, - topk_weights: torch.Tensor - ) -> torch.Tensor: + + def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor: """ Apply mixture of experts to hidden states. 
- + Args: hidden_states: Input tensor of shape (batch_size * seq_len, hidden_size) topk_indices: Expert indices of shape (batch_size * seq_len, top_k) topk_weights: Expert weights of shape (batch_size * seq_len, top_k) - + Returns: Output tensor of shape (batch_size * seq_len, hidden_size) """ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) - + # Create one-hot mask for expert selection expert_mask = F.one_hot(topk_indices, num_classes=len(self.experts)) expert_mask = expert_mask.permute(2, 0, 1) # (num_experts, batch*seq, top_k) - + for expert_idx in range(len(self.experts)): expert = self.experts[expert_idx] mask = expert_mask[expert_idx] token_indices, weight_indices = torch.where(mask) - + if token_indices.numel() > 0: # Get weights and inputs for this expert expert_weights = topk_weights[token_indices, weight_indices] expert_input = hidden_states[token_indices] - + # Apply expert and weight the output expert_output = expert(expert_input) weighted_output = expert_output * expert_weights.unsqueeze(-1) - + # Accumulate weighted outputs final_hidden_states.index_add_(0, token_indices, weighted_output) else: @@ -1022,50 +1033,50 @@ def moe( dummy_input = torch.zeros_like(hidden_states[0]).unsqueeze(0).to(expert_dtype) dummy_out = expert(dummy_input) final_hidden_states = final_hidden_states + dummy_out * 0 - + return final_hidden_states.to(hidden_states.dtype) - + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Forward pass through MoE layer. 
- + Args: hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) - + Returns: Output tensor of shape (batch_size, seq_len, hidden_size) """ residuals = hidden_states orig_shape = hidden_states.shape - + # Route tokens to experts topk_indices, topk_weights = self.gate(hidden_states) - + # Flatten for expert processing hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - + # Apply mixture of experts hidden_states = self.moe(hidden_states, topk_indices, topk_weights) - + # Reshape back to original shape hidden_states = hidden_states.view(*orig_shape) - + # Add shared expert output hidden_states = hidden_states + self.shared_experts(residuals) - + return hidden_states class NemotronHBlock(nn.Module): """A single block in NemotronH - can be Mamba, Attention, MLP, or MoE.""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.config = config self.layer_idx = layer_idx self.residual_in_fp32 = config.residual_in_fp32 self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - + self.block_type = config.layers_block_type[layer_idx] if self.block_type == "mamba": self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) @@ -1077,7 +1088,7 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.mixer = NemotronHMOE(config, layer_idx=layer_idx) else: raise ValueError(f"Invalid block type: {self.block_type}") - + def forward( self, hidden_states: torch.Tensor, @@ -1091,7 +1102,7 @@ def forward( return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) else: return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) - + def _forward_impl( self, hidden_states: torch.Tensor, @@ -1103,19 +1114,15 @@ def _forward_impl( hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) if self.residual_in_fp32: residual = residual.to(torch.float32) - + if self.block_type == "mamba": - hidden_states = self.mixer( - 
hidden_states, cache_params=cache_params, cache_position=cache_position - ) + hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) elif self.block_type == "attention": - hidden_states = self.mixer( - hidden_states, cache_position=cache_position, past_key_value=cache_params - ) + hidden_states = self.mixer(hidden_states, cache_position=cache_position, past_key_value=cache_params) hidden_states = hidden_states[0] elif self.block_type in ("mlp", "moe"): hidden_states = self.mixer(hidden_states) - + hidden_states = residual + hidden_states return hidden_states @@ -1123,22 +1130,22 @@ def _forward_impl( class NemotronHModel(nn.Module): """ NemotronH backbone model. - + This is the main backbone that can be used as a decoder in TTS models. It exposes the same interface as HuggingFace transformer models. """ - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config - + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - + self.gradient_checkpointing = False self._init_weights() - + def _init_weights(self): """Initialize weights with special handling for Mamba components.""" for name, module in self.named_modules(): @@ -1146,7 +1153,7 @@ def _init_weights(self): # Mark parameters that should not have weight decay module.A_log._no_weight_decay = True module.D._no_weight_decay = True - + # Special initialization for dt_bias using inverse softplus # This follows the Mamba2 initialization scheme dt = torch.exp( @@ -1154,13 +1161,13 @@ def _init_weights(self): * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + math.log(self.config.time_step_min) ).clamp(min=self.config.time_step_floor) - + # Inverse of softplus: 
https://github.com/pytorch/pytorch/issues/72759 inv_dt = dt + torch.log(-torch.expm1(-dt)) with torch.no_grad(): module.dt_bias.copy_(inv_dt) module.dt_bias._no_reinit = True - + elif isinstance(module, nn.Linear): nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: @@ -1168,7 +1175,7 @@ def _init_weights(self): nn.init.zeros_(module.bias) elif isinstance(module, nn.Embedding): nn.init.normal_(module.weight, std=self.config.initializer_range) - + # Rescale prenorm residual weights for better training stability # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) if self.config.rescale_prenorm_residual: @@ -1178,13 +1185,13 @@ def _init_weights(self): # Scale by 1/sqrt(num_hidden_layers) with torch.no_grad(): p /= math.sqrt(self.config.num_hidden_layers) - + def get_input_embeddings(self): return self.embeddings - + def set_input_embeddings(self, new_embeddings): self.embeddings = new_embeddings - + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1203,20 +1210,22 @@ def forward( # Support both cache_params and past_key_values for compatibility if past_key_values is not None and cache_params is None: cache_params = past_key_values - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - + if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) 
- + hidden_states = inputs_embeds - + # Create cache if use_cache=True but no cache provided if use_cache and cache_params is None: cache_params = HybridMambaAttentionDynamicCache( @@ -1225,17 +1234,17 @@ def forward( dtype=hidden_states.dtype, device=hidden_states.device, ) - + if cache_position is None: cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) - + # Create causal mask for attention layers causal_mask = self._create_causal_mask(attention_mask, inputs_embeds, cache_position) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - + all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - + for layer_idx, layer in enumerate(self.layers): if layer.block_type == "mamba": layer_mask = mamba_mask @@ -1243,10 +1252,10 @@ def forward( layer_mask = causal_mask else: layer_mask = None - + if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - + if self.gradient_checkpointing and self.training: hidden_states = torch.utils.checkpoint.checkpoint( layer.__call__, hidden_states, cache_params, cache_position, layer_mask @@ -1258,48 +1267,48 @@ def forward( cache_position=cache_position, attention_mask=layer_mask, ) - + hidden_states = self.norm_f(hidden_states) - + if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - + if not return_dict: return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) - + return NemotronHOutput( last_hidden_state=hidden_states, past_key_values=cache_params if use_cache else None, hidden_states=all_hidden_states, attentions=all_self_attns, ) - + def _create_causal_mask(self, attention_mask, input_tensor, cache_position): """Create causal attention mask.""" dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] target_length = cache_position[-1] + 1 - + causal_mask = 
torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) - + if attention_mask is not None: causal_mask = causal_mask.clone() if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) - + return causal_mask - + def _update_mamba_mask(self, attention_mask, cache_position): """ Update Mamba mask with optimization. - + No need for zeroing states when: 1. Cached forward (cache_position[0] > 0) 2. Attending to all inputs (all mask values are 1) @@ -1313,40 +1322,40 @@ def _update_mamba_mask(self, attention_mask, cache_position): class NemotronHForCausalLM(nn.Module): """ NemotronH model with a language modeling head. - + This is the full model that matches the AutoModelForCausalLM interface. 
""" - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config self.backbone = NemotronHModel(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - + self._init_weights() - + def _init_weights(self): """Initialize weights.""" nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range) - + def get_input_embeddings(self): return self.backbone.get_input_embeddings() - + def set_input_embeddings(self, new_embeddings): self.backbone.set_input_embeddings(new_embeddings) - + def get_output_embeddings(self): return self.lm_head - + def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - + @property def model(self): """Alias for backbone, for HuggingFace compatibility.""" return self.backbone - + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1364,7 +1373,7 @@ def forward( **kwargs, ) -> Union[Tuple, NemotronHCausalLMOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + outputs = self.backbone( input_ids=input_ids, inputs_embeds=inputs_embeds, @@ -1378,10 +1387,10 @@ def forward( return_dict=return_dict, cache_position=cache_position, ) - + hidden_states = outputs.last_hidden_state if return_dict else outputs[0] logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() - + loss = None if labels is not None: labels = labels.to(logits.device) @@ -1389,11 +1398,11 @@ def forward( shift_labels = labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output - + return NemotronHCausalLMOutput( loss=loss, logits=logits, @@ -1401,7 +1410,7 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - + def prepare_inputs_for_generation( 
self, input_ids, @@ -1415,42 +1424,41 @@ def prepare_inputs_for_generation( ): """Prepare inputs for generation.""" empty_past_kv = past_key_values is None - + # If we have cache: slice input_ids through cache_position to keep only unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids # Exception 3: with synced GPUs cache_position may go out of bounds if not empty_past_kv: - if ( - inputs_embeds is not None # Exception 1 - or cache_position[-1] >= input_ids.shape[1] # Exception 3 - ): - input_ids = input_ids[:, -cache_position.shape[0]:] + if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 # Exception 3 + input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case input_ids = input_ids[:, cache_position] else: past_key_values = HybridMambaAttentionDynamicCache( self.config, input_ids.shape[0], self.backbone.embeddings.weight.dtype, device=input_ids.device ) - + # Create position_ids on the fly for batch generation if not provided if attention_mask is not None and position_ids is None: position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if not empty_past_kv: - position_ids = position_ids[:, -input_ids.shape[1]:] - + position_ids = position_ids[:, -input_ids.shape[1] :] + # If inputs_embeds are passed, only use them in the 1st generation step if inputs_embeds is not None and empty_past_kv: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids.contiguous()} - - model_inputs.update({ - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "cache_position": cache_position, - }) + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + 
"attention_mask": attention_mask, + "cache_position": cache_position, + } + ) return model_inputs diff --git a/tests/collections/tts/test_nemotron_h_decoder.py b/tests/collections/tts/test_nemotron_h_decoder.py index 4b21dc1ae716..943abe1d9046 100644 --- a/tests/collections/tts/test_nemotron_h_decoder.py +++ b/tests/collections/tts/test_nemotron_h_decoder.py @@ -25,15 +25,18 @@ try: import pytest + PYTEST_AVAILABLE = True except ImportError: PYTEST_AVAILABLE = False + # Create a dummy pytest fixture decorator for standalone execution class pytest: @staticmethod def fixture(func): return func + import torch from nemo.collections.tts.modules.nemotron_h_decoder import ( @@ -41,42 +44,36 @@ def fixture(func): NemotronHConfig, NemotronHForCausalLM, NemotronHMLP, - NemotronHMOE, NemotronHModel, + NemotronHMOE, NemotronHTopkRouter, ) class TestNemotronHConfig: """Test NemotronHConfig initialization and defaults.""" - + def test_default_config(self): """Test default config initialization.""" config = NemotronHConfig() assert config.hidden_size == 1536 assert config.num_hidden_layers == 24 assert len(config.layers_block_type) == config.num_hidden_layers - + def test_custom_pattern(self): """Test custom hybrid_override_pattern.""" - config = NemotronHConfig( - num_hidden_layers=8, - hybrid_override_pattern="M*M*M*M*" - ) + config = NemotronHConfig(num_hidden_layers=8, hybrid_override_pattern="M*M*M*M*") assert config.layers_block_type == ['mamba', 'attention'] * 4 - + def test_pattern_extension(self): """Test that short patterns are extended to match num_hidden_layers.""" - config = NemotronHConfig( - num_hidden_layers=8, - hybrid_override_pattern="M*" - ) + config = NemotronHConfig(num_hidden_layers=8, hybrid_override_pattern="M*") assert len(config.layers_block_type) == 8 class TestNemotronHModel: """Test NemotronHModel backbone.""" - + @pytest.fixture def small_config(self): """Create a small config for testing.""" @@ -93,51 +90,51 @@ def small_config(self): 
intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" return NemotronHModel(small_config) - + def test_model_creation(self, model, small_config): """Test model can be created.""" assert model is not None assert len(model.layers) == small_config.num_hidden_layers - + def test_forward_with_input_ids(self, model): """Test forward pass with input_ids.""" batch_size, seq_len = 2, 16 input_ids = torch.randint(0, 1000, (batch_size, seq_len)) - + output = model(input_ids=input_ids) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, 64) - + def test_forward_with_inputs_embeds(self, model): """Test forward pass with inputs_embeds (required for TTS).""" batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, hidden_size) - + def test_get_set_input_embeddings(self, model): """Test get/set input embeddings interface.""" original_embeddings = model.get_input_embeddings() assert original_embeddings is not None - + new_embeddings = torch.nn.Embedding(100, 64) model.set_input_embeddings(new_embeddings) - + assert model.get_input_embeddings() is new_embeddings class TestNemotronHForCausalLM: """Test NemotronHForCausalLM full model.""" - + @pytest.fixture def small_config(self): """Create a small config for testing.""" @@ -154,69 +151,69 @@ def small_config(self): intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" return NemotronHForCausalLM(small_config) - + def test_model_creation(self, model, small_config): """Test model can be created.""" assert model is not None assert model.backbone is not None assert model.lm_head is not 
None - + def test_model_alias(self, model): """Test that model.model returns backbone (HF compatibility).""" assert model.model is model.backbone - + def test_forward_with_inputs_embeds(self, model): """Test forward pass with inputs_embeds.""" batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.logits is not None assert output.logits.shape == (batch_size, seq_len, 1000) # vocab_size - + def test_interface_compatibility(self, model): """Test that model satisfies EasyMagpieTTSModel interface requirements.""" # Test 1: decoder.get_input_embeddings() embeddings = model.backbone.get_input_embeddings() assert embeddings is not None - + # Test 2: decoder.set_input_embeddings() new_emb = torch.nn.Embedding(100, 64) model.backbone.set_input_embeddings(new_emb) assert model.backbone.get_input_embeddings() is new_emb - + # Reset for next tests model.backbone.set_input_embeddings(embeddings) - + # Test 3: decoder(inputs_embeds, attention_mask, use_cache, past_key_values) batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) attention_mask = torch.ones(batch_size, seq_len) - + output = model.backbone( inputs_embeds=inputs_embeds, attention_mask=attention_mask, use_cache=False, past_key_values=None, ) - + # Test 4: Return .last_hidden_state assert hasattr(output, 'last_hidden_state') assert output.last_hidden_state is not None - + # Test 5: Return .past_key_values (when use_cache=True not tested here as it requires more setup) assert hasattr(output, 'past_key_values') class TestHybridCache: """Test HybridMambaAttentionDynamicCache.""" - + def test_cache_creation(self): """Test cache can be created.""" config = NemotronHConfig( @@ -228,10 +225,10 @@ def test_cache_creation(self): conv_kernel=4, hybrid_override_pattern="M*M*", ) - + batch_size = 2 cache = HybridMambaAttentionDynamicCache(config, batch_size, 
dtype=torch.float32) - + assert len(cache.conv_states) == config.num_hidden_layers assert len(cache.ssm_states) == config.num_hidden_layers assert len(cache.key_cache) == config.num_hidden_layers @@ -240,7 +237,7 @@ def test_cache_creation(self): class TestNemotronHCausality: """Test that NemotronH model is causal (future timesteps don't affect previous ones).""" - + @pytest.fixture def small_config(self): """Create a small config for testing causality.""" @@ -257,18 +254,18 @@ def small_config(self): intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" model = NemotronHModel(small_config) model.eval() # Set to eval mode for deterministic behavior return model - + def test_causality_with_input_modification(self, model, small_config): """ Test causality by modifying future timesteps and checking that earlier outputs are unchanged. - + The test: 1. Pass sequence through the model 2. Modify a future timestep in the input @@ -276,63 +273,65 @@ def test_causality_with_input_modification(self, model, small_config): """ batch_size, seq_len = 2, 16 hidden_size = small_config.hidden_size - + # Create a base input torch.manual_seed(42) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + # Get output with original input with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + # Test at different positions test_positions = [seq_len // 4, seq_len // 2, 3 * seq_len // 4] - + for modify_pos in test_positions: # Create modified input where we change timesteps from modify_pos onwards inputs_embeds_modified = inputs_embeds_original.clone() # Add random noise to all positions from modify_pos onwards - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 # Large modification to ensure it would affect outputs if not causal - + inputs_embeds_modified[:, modify_pos:, :] 
+= ( + torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + ) # Large modification to ensure it would affect outputs if not causal + # Get output with modified input with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + # Check that outputs BEFORE modify_pos are unchanged outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - + # Should be exactly equal (within floating point tolerance) - assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \ - f"Causality violation: modifying position {modify_pos} affected earlier positions" - + assert torch.allclose( + outputs_before_original, outputs_before_modified, atol=1e-5 + ), f"Causality violation: modifying position {modify_pos} affected earlier positions" + # Verify that outputs AT and AFTER modify_pos are different (sanity check) outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :] - - assert not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3), \ - f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position" - + + assert not torch.allclose( + outputs_after_original, outputs_after_modified, atol=1e-3 + ), f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position" + def test_causality_incremental_vs_full(self, model, small_config): """ Test causality by comparing incremental (token-by-token) vs full sequence processing. - + A causal model should produce the same output whether we: 1. Process the full sequence at once 2. 
Process tokens incrementally one at a time """ batch_size, seq_len = 1, 8 # Smaller seq for incremental test hidden_size = small_config.hidden_size - + torch.manual_seed(123) inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + # Get output from full sequence with torch.no_grad(): output_full = model(inputs_embeds=inputs_embeds) - + # Get outputs incrementally (one token at a time) # For a causal model, output at each position should match incremental_outputs = [] @@ -341,47 +340,47 @@ def test_causality_incremental_vs_full(self, model, small_config): partial_output = model(inputs_embeds=inputs_embeds[:, :t, :]) # Take only the last timestep output for comparison incremental_outputs.append(partial_output.last_hidden_state[:, -1:, :]) - + # Stack incremental outputs output_incremental = torch.cat(incremental_outputs, dim=1) - + # Compare: the full sequence output should match the incrementally computed outputs - assert torch.allclose(output_full.last_hidden_state, output_incremental, atol=1e-4), \ - "Causality violation: incremental processing produces different results than full sequence" - + assert torch.allclose( + output_full.last_hidden_state, output_incremental, atol=1e-4 + ), "Causality violation: incremental processing produces different results than full sequence" + def test_causality_causal_lm(self, small_config): """Test causality for NemotronHForCausalLM.""" model = NemotronHForCausalLM(small_config) model.eval() - + batch_size, seq_len = 2, 12 hidden_size = small_config.hidden_size - + torch.manual_seed(456) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + modify_pos = seq_len // 2 - + # Get logits with original input with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + # Modify future positions inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + 
inputs_embeds_modified[:, modify_pos:, :] += torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + # Check logits before modify_pos are unchanged logits_before_original = output_original.logits[:, :modify_pos, :] logits_before_modified = output_modified.logits[:, :modify_pos, :] - - assert torch.allclose(logits_before_original, logits_before_modified, atol=1e-5), \ - "Causality violation in CausalLM: modifying future positions affected earlier logits" - + + assert torch.allclose( + logits_before_original, logits_before_modified, atol=1e-5 + ), "Causality violation in CausalLM: modifying future positions affected earlier logits" + def test_causality_different_layer_types(self): """Test causality with different hybrid patterns (Mamba-only, Attention-only, mixed).""" patterns = [ @@ -390,7 +389,7 @@ def test_causality_different_layer_types(self): "M*M*", # Alternating "MM**", # Mixed blocks ] - + for pattern in patterns: config = NemotronHConfig( hidden_size=64, @@ -405,39 +404,40 @@ def test_causality_different_layer_types(self): intermediate_size=128, hybrid_override_pattern=pattern, ) - + model = NemotronHModel(config) model.eval() - + batch_size, seq_len = 2, 8 hidden_size = config.hidden_size - + torch.manual_seed(789) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + modify_pos = 4 - + with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + inputs_embeds_modified[:, modify_pos:, :] += ( + torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + ) + with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] 
outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - - assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \ - f"Causality violation for pattern '{pattern}': modifying future positions affected earlier outputs" + + assert torch.allclose( + outputs_before_original, outputs_before_modified, atol=1e-5 + ), f"Causality violation for pattern '{pattern}': modifying future positions affected earlier outputs" class TestMoELayer: """Test Mixture of Experts layer.""" - + @pytest.fixture def moe_config(self): """Create a config for MoE testing.""" @@ -463,60 +463,60 @@ def moe_config(self): norm_topk_prob=True, hybrid_override_pattern="M*ME", # Includes MoE layer ) - + def test_topk_router_creation(self, moe_config): """Test NemotronHTopkRouter creation.""" router = NemotronHTopkRouter(moe_config) assert router.weight.shape == (moe_config.n_routed_experts, moe_config.hidden_size) assert router.top_k == moe_config.num_experts_per_tok - + def test_topk_router_forward(self, moe_config): """Test NemotronHTopkRouter forward pass.""" router = NemotronHTopkRouter(moe_config) batch_size, seq_len = 2, 8 hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + topk_indices, topk_weights = router(hidden_states) - + # Check shapes assert topk_indices.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) assert topk_weights.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) - + # Check indices are valid assert topk_indices.min() >= 0 assert topk_indices.max() < moe_config.n_routed_experts - + def test_moe_layer_creation(self, moe_config): """Test NemotronHMOE creation.""" moe = NemotronHMOE(moe_config, layer_idx=0) - + assert len(moe.experts) == moe_config.n_routed_experts assert moe.gate is not None assert moe.shared_experts is not None - + def test_moe_layer_forward(self, moe_config): """Test NemotronHMOE forward pass.""" moe = NemotronHMOE(moe_config, layer_idx=0) batch_size, seq_len 
= 2, 8 hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + output = moe(hidden_states) - + assert output.shape == hidden_states.shape - + def test_model_with_moe_pattern(self, moe_config): """Test full model with MoE layer.""" model = NemotronHModel(moe_config) - + # Check that MoE layer was created assert model.layers[3].block_type == "moe" - + # Test forward pass batch_size, seq_len = 2, 8 inputs_embeds = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, moe_config.hidden_size) @@ -524,7 +524,7 @@ def test_model_with_moe_pattern(self, moe_config): if __name__ == "__main__": """Run basic tests without pytest.""" print("Testing NemotronH Decoder Module...") - + # Test 1: Config print("\n1. Testing NemotronHConfig...") config = NemotronHConfig( @@ -542,12 +542,12 @@ def test_model_with_moe_pattern(self, moe_config): ) print(f" Config created: {config.num_hidden_layers} layers, pattern={config.hybrid_override_pattern}") print(f" Layer types: {config.layers_block_type}") - + # Test 2: Model creation print("\n2. Testing NemotronHModel creation...") model = NemotronHModel(config) print(f" Model created with {len(model.layers)} layers") - + # Test 3: Forward pass with inputs_embeds print("\n3. Testing forward pass with inputs_embeds...") batch_size, seq_len, hidden_size = 2, 16, 64 @@ -555,27 +555,27 @@ def test_model_with_moe_pattern(self, moe_config): output = model(inputs_embeds=inputs_embeds) print(f" Input shape: {inputs_embeds.shape}") print(f" Output shape: {output.last_hidden_state.shape}") - + # Test 4: Full model print("\n4. Testing NemotronHForCausalLM...") full_model = NemotronHForCausalLM(config) output = full_model(inputs_embeds=inputs_embeds) print(f" Logits shape: {output.logits.shape}") - + # Test 5: Interface compatibility print("\n5. 
Testing interface compatibility for EasyMagpieTTSModel...") decoder = full_model.backbone - + # get_input_embeddings emb = decoder.get_input_embeddings() print(f" get_input_embeddings(): {type(emb).__name__}") - + # set_input_embeddings new_emb = torch.nn.Embedding(100, 64) decoder.set_input_embeddings(new_emb) print(f" set_input_embeddings(): OK") decoder.set_input_embeddings(emb) # Reset - + # forward with expected args output = decoder( inputs_embeds=inputs_embeds, @@ -586,7 +586,7 @@ def test_model_with_moe_pattern(self, moe_config): print(f" forward(inputs_embeds, attention_mask, use_cache, past_key_values): OK") print(f" .last_hidden_state: {output.last_hidden_state.shape}") print(f" .past_key_values: {output.past_key_values}") - + # Test 6: MoE layer print("\n6. Testing MoE (Mixture of Experts) layer...") moe_config = NemotronHConfig( @@ -612,26 +612,26 @@ def test_model_with_moe_pattern(self, moe_config): hybrid_override_pattern="M*ME", # Includes MoE layer ) print(f" Config: pattern={moe_config.hybrid_override_pattern}, block_types={moe_config.layers_block_type}") - + # Test router router = NemotronHTopkRouter(moe_config) test_input = torch.randn(2, 8, 64) topk_indices, topk_weights = router(test_input) print(f" Router: topk_indices shape={topk_indices.shape}, topk_weights shape={topk_weights.shape}") - + # Test MoE layer moe = NemotronHMOE(moe_config, layer_idx=0) moe_output = moe(test_input) print(f" MoE layer: input={test_input.shape}, output={moe_output.shape}") - + # Test full model with MoE moe_model = NemotronHModel(moe_config) moe_model_output = moe_model(inputs_embeds=test_input) print(f" Full model with MoE: output={moe_model_output.last_hidden_state.shape}") - + # Test 7: Causality test print("\n7. 
Testing model causality (future timesteps don't affect previous ones)...") - + # Create model for causality test causality_config = NemotronHConfig( hidden_size=64, @@ -648,53 +648,51 @@ def test_model_with_moe_pattern(self, moe_config): ) causality_model = NemotronHModel(causality_config) causality_model.eval() - + batch_size, seq_len = 2, 16 hidden_size = 64 - + # Create base input torch.manual_seed(42) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + # Get output with original input with torch.no_grad(): output_original = causality_model(inputs_embeds=inputs_embeds_original.clone()) - + # Test at different positions test_positions = [4, 8, 12] causality_passed = True - + for modify_pos in test_positions: # Create modified input where we change timesteps from modify_pos onwards inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + inputs_embeds_modified[:, modify_pos:, :] += torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + # Get output with modified input with torch.no_grad(): output_modified = causality_model(inputs_embeds=inputs_embeds_modified) - + # Check that outputs BEFORE modify_pos are unchanged outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - + if torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5): print(f" Position {modify_pos}: PASS (earlier outputs unchanged)") else: print(f" Position {modify_pos}: FAIL (causality violation!)") causality_passed = False - + # Verify outputs at/after modify_pos are different (sanity check) outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :] - + if not torch.allclose(outputs_after_original, outputs_after_modified, 
atol=1e-3): print(f" Position {modify_pos}: Sanity check PASS (later outputs changed)") else: print(f" Position {modify_pos}: Sanity check FAIL (later outputs should change)") causality_passed = False - + # Test with different layer patterns print("\n Testing causality with different layer patterns...") patterns = ["MMMM", "****", "M*M*", "MM**"] @@ -714,32 +712,33 @@ def test_model_with_moe_pattern(self, moe_config): ) pattern_model = NemotronHModel(pattern_config) pattern_model.eval() - + torch.manual_seed(789) test_input = torch.randn(2, 8, 64) modify_pos = 4 - + with torch.no_grad(): out_orig = pattern_model(inputs_embeds=test_input.clone()) - + test_input_mod = test_input.clone() test_input_mod[:, modify_pos:, :] += torch.randn(2, 4, 64) * 10.0 - + with torch.no_grad(): out_mod = pattern_model(inputs_embeds=test_input_mod) - - if torch.allclose(out_orig.last_hidden_state[:, :modify_pos, :], - out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5): + + if torch.allclose( + out_orig.last_hidden_state[:, :modify_pos, :], out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5 + ): print(f" Pattern '{pattern}': PASS") else: print(f" Pattern '{pattern}': FAIL (causality violation!)") causality_passed = False - + if causality_passed: print(" All causality tests PASSED!") else: print(" WARNING: Some causality tests FAILED!") - - print("\n" + "="*50) + + print("\n" + "=" * 50) print("All tests passed!") - print("="*50) + print("=" * 50) From 067a6e808e402bb166b58d7878c1ccafba8a151d Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 2 Feb 2026 17:39:33 -0500 Subject: [PATCH 32/94] inference function refactoring Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 698 ++++++++++++------ 1 file changed, 456 insertions(+), 242 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 3f107736604a..508a1332c31e 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -15,7 +15,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import List, Optional, Sequence, Tuple +from typing import Dict, List, Optional, Sequence, Tuple import torch import wandb @@ -1897,6 +1897,274 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) + def _log_phoneme_predictions( + self, + pred_phoneme_token_lists: List[List[int]], + gt_phoneme_token_lists: List[List[int]], + batch_size: int, + ) -> None: + """Log predicted vs ground truth phoneme tokens for debugging.""" + for item_idx in range(batch_size): + logging.info(f"Predicted phoneme tokens for item {item_idx}: {pred_phoneme_token_lists[item_idx]}") + logging.info(f"GT phoneme tokens for item {item_idx}: {gt_phoneme_token_lists[item_idx]}") + predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) + gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) + logging.info(f"Predicted phoneme text for item {item_idx}: {predicted_phoneme_text}") + logging.info(f"GT phoneme text for item {item_idx}: {gt_phoneme_text}") + + def _collect_phoneme_tokens_for_logging( + self, + pred_phoneme_tokens: torch.Tensor, + gt_phoneme_tokens_current: torch.Tensor, + use_phoneme_input: torch.Tensor, + pred_phoneme_token_lists: List[List[int]], + gt_phoneme_token_lists: List[List[int]], + batch_size: int, + ) -> None: + """Collect phoneme tokens into lists for later logging (does not print).""" + special_tokens = { + self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + } + for item_idx in range(batch_size): + if use_phoneme_input[item_idx, 0, 0] > 0: + for phoneme_channel_idx in range(self.phoneme_stacking_factor): + pred_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() + if 
pred_token not in special_tokens: + pred_phoneme_token_lists[item_idx].append(pred_token) + + gt_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() + if gt_token not in special_tokens: + gt_phoneme_token_lists[item_idx].append(gt_token) + + def _sample_audio_codes( + self, + last_hidden: torch.Tensor, + all_code_logits_t: torch.Tensor, + temperature: float, + topk: int, + use_local_transformer_for_inference: bool, + use_cfg: bool, + cfg_scale: float, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Sample audio codes from logits using either local transformer or parallel sampling. + + Returns: + audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) + all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) + """ + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR: + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + else: + raise ValueError( + f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" + ) + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from all codebook logits + audio_codes_next = self.sample_codes_from_logits( + all_code_logits_t, temperature=temperature, topk=topk + ) + # Argmax sampling for reliable EOS detection + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + + return audio_codes_next, all_codes_next_argmax + + def _process_phoneme_predictions( + self, + last_hidden: torch.Tensor, + actual_batch_size: int, + current_phoneme_positions: torch.Tensor, + gt_phoneme_tokens: torch.Tensor, + phoneme_input_type: str, + phoneme_sampling_method: str, + temperature: float, + topk: int, + timestep_idx: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Process phoneme predictions for the current timestep. + + Returns: + pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) + gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) + input_phoneme_tokens_current: Tokens to use as input (GT or predicted) + input_phoneme_embedding: Embedded phoneme tokens (B, phoneme_stacking_factor, E) + """ + # Get phoneme logits and sample + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) + all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=temperature, topk=topk + ) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) + + # Select predicted tokens based on sampling method + pred_phoneme_tokens = ( + all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme + ) + + # Handle BOS token at position 0 + phoneme_bos_tensor = torch.full( + (actual_batch_size, self.phoneme_stacking_factor), + self.phoneme_tokenizer.bos_token_id, + 
device=device, + ).long() + use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() + pred_phoneme_tokens = ( + use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens + ).long() + + # Get ground truth phoneme tokens for current timestep + gt_phoneme_idx = min(timestep_idx, gt_phoneme_tokens.size(2) - 1) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] + + # Select input tokens (GT or predicted) and embed + input_phoneme_tokens_current = ( + gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + ) + input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) + + return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding + + def _compute_phoneme_channel_input( + self, + input_phoneme_embedding: torch.Tensor, + current_phoneme_positions: torch.Tensor, + phoneme_stream_ended: torch.Tensor, + actual_batch_size: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute the phoneme channel input embedding with masking. 
+ + Returns: + phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) + use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) + """ + # Determine which items should use phoneme input + use_phoneme_input = (current_phoneme_positions >= 0) & (~phoneme_stream_ended) + use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() + + # Create zero embedding for items not using phoneme input + zero_phoneme_embedding = torch.zeros( + actual_batch_size, 1, self.cfg.embedding_dim, device=device + ) + + # Combine: use phoneme embedding where active, zero otherwise + phoneme_channel_input_t = ( + use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + ) + + return phoneme_channel_input_t, use_phoneme_input + + def _prepare_next_decoder_input( + self, + audio_codes_next: torch.Tensor, + context_plus_audio_embedded: torch.Tensor, + context_plus_audio_lens: torch.Tensor, + min_context_len: int, + idx: int, + current_text_input_mode: str, + remaining_text_embedded: Optional[torch.Tensor], + current_text_positions: torch.Tensor, + phoneme_channel_input_t: Optional[torch.Tensor], + use_cfg: bool, + dummy_context_embedding_unconditional: Optional[torch.Tensor], + ) -> torch.Tensor: + """ + Prepare the input embedding for the next decoder step. 
+ + Handles: + - Mixing context embeddings with generated audio embeddings based on context completeness + - Adding streaming text embeddings if in streaming mode + - Adding phoneme channel input if available + - Duplicating for CFG if enabled + """ + batch_size = audio_codes_next.size(0) + device = audio_codes_next.device + + # Embed the newly generated audio codes + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) + new_emb_unconditional = new_emb.clone() + + # Add streaming text embeddings if in streaming mode + if current_text_input_mode == 'streaming': + remaining_text_idx = current_text_positions.clamp(min=0) + remaining_text_embedded_current = remaining_text_embedded[ + torch.arange(batch_size, device=device), remaining_text_idx, : + ].unsqueeze(1) + new_emb = new_emb + remaining_text_embedded_current + + # Check which items still have context to process + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len + + if context_incomplete_mask.any(): + # Some items still processing context - blend context with generated embeddings + context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() + context_embedding_slice = context_plus_audio_embedded[ + :, min_context_len + idx : min_context_len + idx + 1, : + ] + next_input = context_incomplete_mask * context_embedding_slice + (1 - context_incomplete_mask) * new_emb + + if phoneme_channel_input_t is not None: + next_input = next_input + phoneme_channel_input_t + + if use_cfg: + next_input_unconditional = ( + context_incomplete_mask * dummy_context_embedding_unconditional + + (1 - context_incomplete_mask) * new_emb_unconditional + ) + next_input = torch.cat([next_input, next_input_unconditional], dim=0) + else: + # All items finished context - use generated embeddings + next_input = new_emb + if phoneme_channel_input_t is not None: + next_input = next_input + phoneme_channel_input_t + + if use_cfg: + next_input = torch.cat([next_input, 
new_emb_unconditional], dim=0) + + return next_input + + def _check_eos_and_update_end_indices( + self, + all_codes_next_argmax: torch.Tensor, + audio_codes_next: torch.Tensor, + end_indices: Dict[int, int], + context_plus_audio_lens: torch.Tensor, + min_context_len: int, + idx: int, + verbose: bool = False, + ) -> None: + """Check for EOS tokens and update end indices for completed items.""" + for item_idx in range(all_codes_next_argmax.size(0)): + # Only check items that haven't ended and have passed their context + if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: + pred_tokens = all_codes_next_argmax[item_idx] + pred_tokens_multinomial = audio_codes_next[item_idx] + + if torch.any(pred_tokens == self.audio_eos_id) or torch.any( + pred_tokens_multinomial == self.audio_eos_id + ): + if verbose: + logging.info(f"EOS detected for item {item_idx} at timestep {idx}") + end_indices[item_idx] = idx + def infer_batch( self, batch, @@ -1911,42 +2179,56 @@ def infer_batch( phoneme_sampling_method='argmax', dropout_text_input=False, inference_mode: Optional[str] = None, + verbose: bool = False, ): """ - Run inference on a batch of inputs. + Run inference on a batch of inputs to generate audio from text. Args: - batch: Input batch containing text, context, etc. + batch: Input batch containing: + - text, text_lens: Input text tokens and lengths + - context_text_tokens, context_text_tokens_lens: Context text for speaker/style + - context_audio_codes/context_audio (optional): Audio context for speaker cloning max_decoder_steps: Maximum number of decoding steps. - temperature: Sampling temperature. + temperature: Sampling temperature for audio codes. topk: Top-k sampling parameter. - use_local_transformer_for_inference: Whether to use local transformer. - maskgit_n_steps: Number of MaskGit steps. + use_local_transformer_for_inference: Whether to use local transformer for AR sampling. 
+ maskgit_n_steps: Number of MaskGit steps (unused in AR mode). use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. + cfg_scale: CFG scale factor (higher = stronger conditioning). phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. - phoneme_sampling_method: 'argmax' or 'sample'. - dropout_text_input: Whether to dropout text input. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + dropout_text_input: Whether to dropout text input for CFG training. inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). - If None, uses the default inference mode (first mode in training_modes). + If None, uses the default inference mode. + verbose: If True, enables detailed logging of decoding progress, EOS detection, + and phoneme predictions. Default False for cleaner output. + + Returns: + predicted_audio: Generated audio waveforms (B, max_audio_len) + predicted_audio_lens: Lengths of generated audio (B,) + predicted_codes: Generated audio codes (B, num_codebooks, T) + predicted_codes_lens: Lengths of generated codes (B,) + rtf_metrics: Dictionary with timing metrics (rtf, time_to_first_prediction, etc.) """ with torch.inference_mode(): start_time = time.time() # Resolve inference mode mode_name = inference_mode if inference_mode is not None else self.default_inference_mode - if mode_name in self.mode_name_to_mode: - selected_training_mode = self.mode_name_to_mode[mode_name] - logging.info(f"Using inference mode: {selected_training_mode.name}") - else: + if mode_name not in self.mode_name_to_mode: available_modes = list(self.mode_name_to_mode.keys()) raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") - # Get current mode parameters + selected_training_mode = self.mode_name_to_mode[mode_name] + if verbose: + logging.info(f"Using inference mode: {selected_training_mode.name}") + current_text_input_mode = selected_training_mode.text_input_mode current_streaming_speech_delay = selected_training_mode.streaming_speech_delay current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + # Prepare context embeddings (text + audio context) context_tensors = self.prepare_context_tensors( text=batch['text'], text_lens=batch['text_lens'], @@ -1964,289 +2246,216 @@ def infer_batch( remaining_text_embedded = context_tensors.remaining_text_embedded remaining_text_lens = context_tensors.remaining_text_lens + actual_batch_size = context_embedding.size(0) + device = context_embedding.device + + # Prepare phoneme channel input if phoneme tokenizer is available + gt_phoneme_tokens = None if self.phoneme_tokenizer is not None: context_lens_for_phonemes = ( context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay ) - phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( - self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes - ) - ) - phoneme_channel_input_pad_tensor = torch.zeros( - phoneme_channel_input.size(0), - max_decoder_steps, - phoneme_channel_input.size(2), - device=phoneme_channel_input.device, + _, _, gt_phoneme_tokens, _ = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes ) - phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) + # Initialize audio codes with BOS token audio_codes_bos = torch.full( - (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), + (actual_batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), 
self.audio_bos_id, - device=context_embedding.device, + device=device, ).long() - audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() - audio_codes_input = audio_codes_bos + audio_codes_lens = torch.ones(actual_batch_size, device=device).long() - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_bos) # (B, 1, E) + + # For streaming mode, add text embeddings to audio BOS if current_text_input_mode == 'streaming': remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 remaining_text_pad_tensor = torch.zeros( - remaining_text_embedded.size(0), - remaining_text_pad_length, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, + actual_batch_size, remaining_text_pad_length, remaining_text_embedded.size(2), device=device ) remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = ( - audio_codes_input_embedded + remaining_text_embedded[:, :1, :] - ) # :1 corresponds to audio BOS. + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + # Combine context and audio embeddings context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) min_context_len = context_plus_audio_lens.min().item() + + # Adjust min_context_len for phoneme delay if using phoneme tokenizer if self.phoneme_tokenizer is not None: min_context_len = ( min_context_len - current_streaming_speech_delay + current_streaming_phonemes_delay - 1 - ) # 1 for audio BOS that we had added. 
+ ) - actual_batch_size = context_embedding.size(0) + # Setup classifier-free guidance if enabled + dummy_context_embedding_unconditional = None if use_cfg: + # Create unconditional context embedding (all UNK tokens) dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) - ) # (B, 1, E) + torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=device) + ) dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( -1, context_embedding.size(1), -1 - ) # (B, T_total, E) + ) dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) + # Concatenate conditional and unconditional inputs: (2B, T_min, E) first_inference_input = torch.cat( [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 - )[ - :, :min_context_len, : - ] # (2B, T_min, E) + )[:, :min_context_len, :] else: - first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] - # Initialize cache_position for tracking sequence position (needed for NemotronH) - cache_position = torch.arange(min_context_len, device=context_embedding.device) - - # First forward pass to get the initial hidden state and past key values + # First forward pass to process all context at once + cache_position = torch.arange(min_context_len, device=device) transformer_out = self.forward( inputs_embeds=first_inference_input, attention_mask=None, use_cache=True, - past_key_values=None, # No past key values for the first step + past_key_values=None, cache_position=cache_position, ) time_to_first_prediction = time.time() - start_time - last_hidden = transformer_out.last_hidden_state # (B, T_total, E) + last_hidden = 
transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - - # Track the current sequence length for cache_position updates current_cache_seq_len = min_context_len + # Initialize decoding state all_predictions = [] - end_indices = {} + end_indices = {} # Maps item_idx -> timestep when EOS was detected - current_text_positions = [] - for item_idx in range(context_embedding.size(0)): - # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) - current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) - current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() - if self.phoneme_tokenizer is not None: - current_phoneme_positions = ( - current_text_positions - current_text_positions.max() - 1 - ) # Make it 0-indexed. - # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + # Track text position for each item in batch + # Negative values indicate we haven't started reading remaining text yet + current_text_positions = torch.tensor( + [min_context_len - context_plus_audio_lens[i] for i in range(actual_batch_size)], + device=device, + ).long() + + # Initialize phoneme tracking state + current_phoneme_positions = None pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - phoneme_stream_ended = torch.zeros( - actual_batch_size, device=context_embedding.device - ).bool() # (B,) Whether phoneme stream has ended for this item. 
+ phoneme_stream_ended = torch.zeros(actual_batch_size, device=device).bool() + + if self.phoneme_tokenizer is not None: + current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 + + # Main autoregressive decoding loop for idx in range(max_decoder_steps): - # import ipdb; ipdb.set_trace() + # Update position trackers current_text_positions += 1 if self.phoneme_tokenizer is not None: current_phoneme_positions += 1 - # print("current_phoneme_positions", current_phoneme_positions) - if idx % 20 == 0: - print(f"Decoding timestep {idx}") - # Project from hidden_dim to audio_embedding_dim, then to logits - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) # (B, num_codebooks * num_tokens_per_codebook) + if verbose and idx % 20 == 0: + logging.info(f"Decoding timestep {idx}") - if self.phoneme_tokenizer is not None: - all_code_logits_t_phoneme = self.phoneme_final_proj( - last_hidden[:, -1, :] - ) # (B, phoneme_stacking_factor * phoneme_vocab_size) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + # Compute audio logits from last hidden state + last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) + all_code_logits_t = self.final_proj(last_hidden_audio) + # Apply CFG to logits if enabled if use_cfg: conditional_logits = all_code_logits_t[:actual_batch_size] unconditional_logits = all_code_logits_t[actual_batch_size:] all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR: - # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) - else: - raise ValueError( - f"Local transformer inference requested by but 
local transformer type is {self.local_transformer_type}" - ) - # TODO @rfejgin: should we add argmax sampling for EOS here too? - all_codes_next_argmax = audio_codes_next - else: - # Parallel sampling from logits - audio_codes_next = self.sample_codes_from_logits( - all_code_logits_t, temperature=temperature, topk=topk - ) # (B, num_codebooks) - all_codes_next_argmax = self.sample_codes_from_logits( - all_code_logits_t, temperature=0.01 - ) # (B, num_codebooks) + # Sample audio codes + audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( + last_hidden=last_hidden, + all_code_logits_t=all_code_logits_t, + temperature=temperature, + topk=topk, + use_local_transformer_for_inference=use_local_transformer_for_inference, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + # Process phoneme predictions if phoneme tokenizer exists phoneme_channel_input_t = None - if self.phoneme_tokenizer is not None: - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=temperature, topk=topk - ) # (B, phoneme_stacking_factor) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) # (B, phoneme_stacking_factor) - pred_phoneme_tokens = ( - all_codes_next_phoneme_argmax - if phoneme_sampling_method == 'argmax' - else all_codes_next_phoneme - ) # B, phoneme_stacking_factor - phoneme_bos_tensor = torch.full( - (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=context_embedding.device, - ).long() # (B, phoneme_stacking_factor) - use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - # print("use_bos_phoneme", use_bos_phoneme) - pred_phoneme_tokens = ( - use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens - ).long() # (B, phoneme_stacking_factor) - - # print("pred_phoneme_tokens", pred_phoneme_tokens) - gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) - 
gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) - # print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) - - input_phoneme_tokens_current = ( - gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + ( + pred_phoneme_tokens, + gt_phoneme_tokens_current, + input_phoneme_tokens_current, + input_phoneme_embedding, + ) = self._process_phoneme_predictions( + last_hidden=last_hidden, + actual_batch_size=actual_batch_size, + current_phoneme_positions=current_phoneme_positions, + gt_phoneme_tokens=gt_phoneme_tokens, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + temperature=temperature, + topk=topk, + timestep_idx=idx, + device=device, ) - input_phoneme_embedding = self.embed_phoneme_tokens( - input_phoneme_tokens_current.unsqueeze(2) - ) # (B, phoneme_stacking_factor, E) - - use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) - use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - zero_phoneme_embedding = torch.zeros( - actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device - ).unsqueeze( - 1 - ) # (B, 1, E) - # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) - phoneme_channel_input_t = ( - use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + + # Compute masked phoneme channel input + phoneme_channel_input_t, use_phoneme_input = self._compute_phoneme_channel_input( + input_phoneme_embedding=input_phoneme_embedding, + current_phoneme_positions=current_phoneme_positions, + phoneme_stream_ended=phoneme_stream_ended, + actual_batch_size=actual_batch_size, + device=device, ) - # print("use_phoneme_input", use_phoneme_input) - for item_idx in range(actual_batch_size): - if 
use_phoneme_input[item_idx, 0, 0] > 0: - for phoneme_channel_idx in range(self.phoneme_stacking_factor): - _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if _phoneme_token not in [ - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - ]: - pred_phoneme_token_lists[item_idx].append(_phoneme_token) - - _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if _gt_phoneme_token not in [ - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - ]: - gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) + # Collect phoneme tokens for logging (no printing here) + self._collect_phoneme_tokens_for_logging( + pred_phoneme_tokens=pred_phoneme_tokens, + gt_phoneme_tokens_current=gt_phoneme_tokens_current, + use_phoneme_input=use_phoneme_input, + pred_phoneme_token_lists=pred_phoneme_token_lists, + gt_phoneme_token_lists=gt_phoneme_token_lists, + batch_size=actual_batch_size, + ) + + # Check for phoneme EOS + for item_idx in range(actual_batch_size): if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): - print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) + if verbose and not phoneme_stream_ended[item_idx]: + logging.info(f"Phoneme EOS detected for item {item_idx} at timestep {idx}") phoneme_stream_ended[item_idx] = True - all_codes_next_phoneme = all_codes_next_phoneme.unsqueeze(1) - # import ipdb; ipdb.set_trace() - - for item_idx in range(all_codes_next_argmax.size(0)): - if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: - pred_tokens = all_codes_next_argmax[item_idx] - pred_tokens_multinomial = audio_codes_next[item_idx] - if torch.any(pred_tokens == self.audio_eos_id) or torch.any( - pred_tokens_multinomial == self.audio_eos_id - ): - print("End detected for item {} at timestep 
{}".format(item_idx, idx)) - end_indices[item_idx] = idx - - all_predictions.append(audio_codes_next) - new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) - new_emb_unconditional = new_emb * 1 + # Check for audio EOS + self._check_eos_and_update_end_indices( + all_codes_next_argmax=all_codes_next_argmax, + audio_codes_next=audio_codes_next, + end_indices=end_indices, + context_plus_audio_lens=context_plus_audio_lens, + min_context_len=min_context_len, + idx=idx, + verbose=verbose, + ) - if current_text_input_mode == 'streaming': - _bs = context_embedding.size(0) - remaining_text_embedded_current = remaining_text_embedded[ - torch.arange(_bs), current_text_positions.clamp(min=0), : - ].unsqueeze( - 1 - ) # (B, 1, E) - new_emb = new_emb + remaining_text_embedded_current - - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) - # import ipdb; ipdb.set_trace() - # True if we have not yet reached the end of the context for this item - # import ipdb; ipdb.set_trace() - if context_incomplete_mask.any(): - # If some contexts are not yet complete. 
- context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - context_embedding = context_plus_audio_embedded[ - :, min_context_len + idx : min_context_len + idx + 1, : - ] # (B, 1, E) - next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb - if phoneme_channel_input_t is not None: - next_input += phoneme_channel_input_t - if use_cfg: - next_input_unconditional = ( - context_incomplete_mask * dummy_context_embedding_unconditional - + (1 - context_incomplete_mask) * new_emb_unconditional - ) - next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) - else: - next_input = new_emb - if phoneme_channel_input_t is not None: - next_input += phoneme_channel_input_t - if use_cfg: - next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + all_predictions.append(audio_codes_next) - # Update cache_position for current step (needed for NemotronH cached forward) - cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) + # Prepare input for next decoder step + next_input = self._prepare_next_decoder_input( + audio_codes_next=audio_codes_next, + context_plus_audio_embedded=context_plus_audio_embedded, + context_plus_audio_lens=context_plus_audio_lens, + min_context_len=min_context_len, + idx=idx, + current_text_input_mode=current_text_input_mode, + remaining_text_embedded=remaining_text_embedded, + current_text_positions=current_text_positions, + phoneme_channel_input_t=phoneme_channel_input_t, + use_cfg=use_cfg, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + ) + # Forward pass for next token + cache_position = torch.tensor([current_cache_seq_len], device=device) transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, @@ -2256,43 +2465,48 @@ def infer_batch( ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - - # 
Increment sequence length for next iteration current_cache_seq_len += 1 - if len(end_indices) == audio_codes_next.size(0): - print("All items finished at timestep {}".format(idx)) + + # Check if all items have finished + if len(end_indices) == actual_batch_size: + if verbose: + logging.info(f"All items finished at timestep {idx}") break - if self.phoneme_tokenizer is not None: - for item_idx in range(actual_batch_size): - print( - "Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx]) - ) - print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) - predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) - gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) - print("Predicted phoneme text for item {}: {}".format(item_idx, predicted_phoneme_text)) - print("GT phoneme text for item {}: {}".format(item_idx, gt_phoneme_text)) + # Log phoneme predictions if verbose + if verbose and self.phoneme_tokenizer is not None: + self._log_phoneme_predictions( + pred_phoneme_token_lists=pred_phoneme_token_lists, + gt_phoneme_token_lists=gt_phoneme_token_lists, + batch_size=actual_batch_size, + ) + # Post-process predictions tts_generation_time = time.time() - start_time tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + + # Calculate predicted lengths, accounting for context offset + pred_codes_start_indices = context_plus_audio_lens - min_context_len predicted_lens = [ - end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0)) - ] # Ensure that the codec is atleast of length 4 - predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + end_indices.get(i, max_decoder_steps) for i in 
range(actual_batch_size) + ] + predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices + # Stack and slice predictions to remove context portion predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) predicted_codes = self.slice_pred_embeddings( predicted_codes.permute(0, 2, 1), context_lens=pred_codes_start_indices, target_lens=predicted_codes_lens, ) - predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_codes = predicted_codes.permute(0, 2, 1) + + # Remove EOS tokens and convert codes to audio predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + # Compute RTF metrics end_time = time.time() total_audio_duration_generated = ( predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] @@ -2305,7 +2519,7 @@ def infer_batch( 'tts_generation_time': tts_generation_time, 'max_frames_generated': len(all_predictions), 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': context_embedding.size(0), + 'batch_size': actual_batch_size, } return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics From 79457c6817fc2ebe6e11be825b9f0057c56cf7d7 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 4 Feb 2026 04:29:55 -0500 Subject: [PATCH 33/94] revert some changes and remove scripts Signed-off-by: Paarth Neekhara --- examples/tts/evalset_config.json | 48 - nemo/collections/tts/models/audio_codec.py | 16 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 358 - .../ipa_scripts/analyze_ipa_tokenization.py | 734 -- .../ipa_scripts/cuts_dirs_config.json | 45 - .../ipa_scripts/train_ipa_bpe_tokenizer.py | 522 - ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 ----------------- 7 files changed, 5 insertions(+), 
11672 deletions(-) delete mode 100644 scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py delete mode 100644 scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py delete mode 100644 scripts/magpietts/ipa_scripts/cuts_dirs_config.json delete mode 100644 scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py delete mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 49822ce9cf25..4be3056020ce 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -13,53 +13,5 @@ "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1_longform_tiny.json", "audio_dir": "/", "feature_dir": null - }, - "riva_multibpe": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/riva_hard_multi_bpe.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_digits": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-digits-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_letters": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-letters-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_money": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-money-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_short": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-short-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "vctk": { - "manifest_path": 
"/Data/evaluation_manifests/ipa_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", - "audio_dir": "/Data/VCTK-Corpus-0.92", - "feature_dir": "/Data/VCTK-Corpus-0.92", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "libritts_seen": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", - "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "libritts_test_clean": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", - "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS", - "tokenizer_names": ["nemotron_nano_30b"] } } diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index de11bb4f9229..42b6c81f0f0b 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -110,7 +110,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.audio_decoder = instantiate(cfg.audio_decoder) # Discriminator setup - # self.discriminator = instantiate(cfg.discriminator) + self.discriminator = instantiate(cfg.discriminator) # Mel loss setup loss_resolutions = cfg.loss_resolutions @@ -182,16 +182,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - import os - - # TODO: revert this - if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): - self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) - else: - self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", - 
strict=False, - ) + self.speaker_encoder.load_checkpoint( + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", + strict=False, + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") diff --git a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py deleted file mode 100644 index 10972d1bdc6a..000000000000 --- a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python3 -""" -Add IPA strings (from espeak/espeak-ng) to Lhotse cuts jsonl.gz shards. - -For each cuts directory like: - /Data/.../de/.../cuts -creates: - /Data/.../de/.../cuts_with_ipa -and writes corresponding cuts.000000.jsonl.gz, etc. with an added IPA field. - -IPA is added to each supervision under: - cut["supervisions"][i]["custom"]["ipa"] - -Usage: - python add_ipa_to_cuts.py --lang de - python add_ipa_to_cuts.py --lang all # run all languages - -Edit the `CUTS_DIRS_BY_LANG` dict below (or replace with argparse/config as desired). 
-""" - -from __future__ import annotations - -import argparse -import concurrent.futures as cf -import gzip -import json -import os -import re -import shutil -import subprocess -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple - -# ------------------------- -# USER CONFIG -# ------------------------- - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -# Map your dataset language keys to espeak voice codes (adjust as needed). -# For German, espeak-ng uses "de" typically. -ESPEAK_VOICE_BY_LANG: Dict[str, str] = { - "de": "de", - "en": "en", - "es": "es", - "fr": "fr", - "hi": "hi", - "it": "it", - "vi": "vi", - "zh": "zh", - "ru": "ru", - "ja": "ja", - "ko": "ko", - "ar": "ar", - "he": "he", - "nl": "nl", - "pl": "pl", - "pt": "pt", -} - -OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa -SHARD_GLOB = "cuts.*.jsonl.gz" - -# Parallelism -MAX_WORKERS = max(1, (os.cpu_count() or 4) - 1) -# MAX_WORKERS = 8 - -# If True, skip writing if output shard exists (basic resume) -SKIP_EXISTING_OUTPUT_SHARDS = False -# ------------------------- -# IMPLEMENTATION -# ------------------------- - -IPA_FLAG = "--ipa" # espeak-ng uses --ipa, espeak supports --ipa in many builds -# Use --quiet if available; safe to try. -COMMON_FLAGS = ["-q"] - -# Some espeak builds output extra spaces/newlines; we normalize. 
-_WS_RE = re.compile(r"\s+") - - -def _find_espeak_binary() -> str: - """Prefer espeak-ng if present, else espeak.""" - for exe in ("espeak-ng", "espeak"): - if shutil.which(exe): - return exe - raise RuntimeError( - "Neither 'espeak-ng' nor 'espeak' was found on PATH. " "Install espeak-ng (recommended) or espeak." - ) - - -@dataclass(frozen=True) -class EspeakRunner: - exe: str - voice: str - - def text_to_ipa(self, text: str) -> str: - """ - Convert text -> IPA using espeak/espeak-ng. - """ - # Note: We pass text via stdin to avoid shell escaping issues. - cmd = [self.exe, "-v", self.voice, IPA_FLAG] + COMMON_FLAGS - try: - proc = subprocess.run( - cmd, - input=text.encode("utf-8"), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, - ) - except Exception as e: - raise RuntimeError(f"Failed to run {cmd}: {e}") from e - - if proc.returncode != 0: - raise RuntimeError( - f"espeak command failed (rc={proc.returncode})\n" - f"cmd: {' '.join(cmd)}\n" - f"stderr: {proc.stderr.decode('utf-8', errors='replace')}" - ) - - out = proc.stdout.decode("utf-8", errors="replace").strip() - # Normalize whitespace to single spaces - out = _WS_RE.sub(" ", out).strip() - return out - - -def iter_shards(cuts_dir: Path) -> List[Path]: - return sorted(cuts_dir.glob(SHARD_GLOB)) - - -def derive_output_dir(cuts_dir: Path) -> Path: - # If dir name ends with "cuts", produce "cuts_with_ipa". - # Otherwise append suffix to the directory name. - name = cuts_dir.name - if name == "cuts": - out_name = f"cuts{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def load_json_line(line: str) -> dict: - return json.loads(line) - - -def dump_json_line(obj: dict) -> str: - # compact, consistent output - return json.dumps(obj, ensure_ascii=False) - - -class IPACache: - """ - Process-local cache. Speeds up repeated identical texts. 
- """ - - def __init__(self) -> None: - self._cache: Dict[Tuple[str, str], str] = {} - - def get(self, voice: str, text: str) -> Optional[str]: - return self._cache.get((voice, text)) - - def set(self, voice: str, text: str, ipa: str) -> None: - self._cache[(voice, text)] = ipa - - -def add_ipa_to_cut( - cut: dict, - espeak: EspeakRunner, - cache: IPACache, -) -> dict: - """ - Adds IPA to each supervision custom field: custom["ipa"]. - Uses supervision["custom"]["normalized_text"] if available, otherwise supervision["text"] as source text. - For Vietnamese (vi), uses original_text and updates text/normalized_text fields. - """ - sups = cut.get("supervisions") or [] - is_vietnamese = espeak.voice == "vi" - for sup in sups: - custom = sup.get("custom") - if custom is None: - custom = {} - sup["custom"] = custom - - # For Vietnamese, use original_text and fix the text fields - if is_vietnamese and custom.get("original_text"): - text = custom["original_text"] - sup["text"] = text - custom["normalized_text"] = text - else: - text = custom.get("normalized_text") or sup.get("text") - - if not text: - continue - - # If already has IPA, keep it - if "ipa" in custom and isinstance(custom["ipa"], str) and custom["ipa"].strip(): - continue - - cached = cache.get(espeak.voice, text) - if cached is None: - cached = espeak.text_to_ipa(text) - cache.set(espeak.voice, text, cached) - - custom["ipa"] = cached - - return cut - - -def process_shard( - shard_path: Path, - out_shard_path: Path, - espeak: EspeakRunner, -) -> Tuple[Path, int]: - """ - Read shard jsonl.gz, add IPA, write out shard jsonl.gz - Returns: (out_shard_path, num_lines) - """ - cache = IPACache() - n = 0 - - with ( - gzip.open(shard_path, "rt", encoding="utf-8") as fin, - gzip.open(out_shard_path, "wt", encoding="utf-8") as fout, - ): - for line in fin: - line = line.strip() - if not line: - continue - cut = load_json_line(line) - cut = add_ipa_to_cut(cut, espeak=espeak, cache=cache) - 
fout.write(dump_json_line(cut)) - fout.write("\n") - n += 1 - - return out_shard_path, n - - -def process_cuts_dir(lang: str, cuts_dir: Path) -> None: - voice = ESPEAK_VOICE_BY_LANG.get(lang, lang) - exe = _find_espeak_binary() - espeak = EspeakRunner(exe=exe, voice=voice) - - out_dir = derive_output_dir(cuts_dir) - out_dir.mkdir(parents=True, exist_ok=True) - - shards = iter_shards(cuts_dir) - if not shards: - print(f"[WARN] No shards matched {SHARD_GLOB} in {cuts_dir}", file=sys.stderr) - return - - print(f"[INFO] {lang}: {cuts_dir} -> {out_dir} (shards={len(shards)})") - - jobs: List[Tuple[Path, Path]] = [] - for shard in shards: - out_shard = out_dir / shard.name - if SKIP_EXISTING_OUTPUT_SHARDS and out_shard.exists(): - continue - jobs.append((shard, out_shard)) - - if not jobs: - print(f"[INFO] {lang}: nothing to do in {cuts_dir} (all outputs exist).") - return - - # Parallelize per shard - with cf.ProcessPoolExecutor(max_workers=MAX_WORKERS) as ex: - futures = [] - for shard, out_shard in jobs: - futures.append(ex.submit(_process_shard_worker, shard, out_shard, espeak.exe, espeak.voice)) - - for fut in cf.as_completed(futures): - out_shard_path, n = fut.result() - print(f"[OK] wrote {out_shard_path} (lines={n})") - - -def _process_shard_worker(shard: Path, out_shard: Path, exe: str, voice: str) -> Tuple[Path, int]: - # Re-create runner in worker process - espeak = EspeakRunner(exe=exe, voice=voice) - return process_shard(shard, out_shard, espeak) - - -def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: - """Return list of all available language codes.""" - return list(cuts_dirs.keys()) - - -def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: - """ - Process all directories for a given language. - Returns True if successful, False if there was an issue. 
- """ - if lang not in cuts_dirs: - print(f"[ERROR] Unknown language: {lang}", file=sys.stderr) - print(f"[ERROR] Available languages: {get_available_languages(cuts_dirs)}", file=sys.stderr) - return False - - dirs = cuts_dirs[lang] - for d in dirs: - cuts_dir = Path(d) - if not cuts_dir.exists(): - print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) - continue - process_cuts_dir(lang, cuts_dir) - - return True - - -def main() -> None: - parser = argparse.ArgumentParser(description="Add IPA strings to Lhotse cuts jsonl.gz shards.") - parser.add_argument( - "--lang", - type=str, - required=True, - help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages.", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}", - ) - args = parser.parse_args() - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - print(f"[INFO] Loaded config with languages: {get_available_languages(cuts_dirs)}") - - if args.lang == "all": - # Process all languages - for lang in cuts_dirs.keys(): - print(f"\n{'='*60}") - print(f"[INFO] Processing language: {lang}") - print(f"{'='*60}") - process_language(lang, cuts_dirs) - else: - success = process_language(args.lang, cuts_dirs) - if not success: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py deleted file mode 100644 index e2d53c3099d3..000000000000 --- a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py +++ /dev/null @@ -1,734 +0,0 @@ -#!/usr/bin/env python3 -""" -Analyze and compare tokenization (tokens per second of audio) between: -1. Qwen/Qwen2.5-1.5B-Instruct tokenizer on raw text -2. NVIDIA Nemotron Nano 30B tokenizer on raw text -3. 
IPABPETokenizer on phonemized IPA text at different vocab sizes - -This script: -1. Creates a balanced IPA corpus (equal samples per language) from train_langs -2. Trains IPA BPE tokenizers at vocab sizes 512, 1024, 2048, 4096 -3. For each test language, samples text pairs from cuts_with_ipa directories -4. Computes tokens per second (tokens / audio duration) for each tokenizer -5. Outputs comparison statistics showing tokens/second for each tokenizer - -Features: -- Reads data once and reuses across all vocab sizes (efficient) -- Balances training data across languages (uses min count across all train langs) -- Supports separate train and test language sets -- Computes tokens per second using audio duration from cuts - -Usage: - # Train and test on all languages - python analyze_ipa_tokenization.py --output_dir /path/to/output - - # Train on en,de,fr but test on all languages - python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs en,de,fr --test_langs all - - # Train on all, test on specific languages - python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs all --test_langs en,zh - - # Cap training samples per language - python analyze_ipa_tokenization.py --output_dir /path/to/output --max_samples_per_lang 50000 -""" - -import argparse -import gzip -import json -import os -import random -import sys -from collections import defaultdict -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Generator, List, Optional, Tuple - -import numpy as np -from tokenizers import Tokenizer -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.trainers import BpeTrainer -from transformers import AutoTokenizer - -# ------------------------- -# CONFIGURATION -# ------------------------- - -VOCAB_SIZES = [512, 1024, 2048, 4096] - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / 
"cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -OUTPUT_SUFFIX = "_with_ipa" -SHARD_GLOB = "cuts.*.jsonl.gz" - - -@dataclass -class TextPair: - """A pair of raw text and its IPA phonemization with audio duration.""" - - raw_text: str - ipa_text: str - lang: str - duration: float # audio duration in seconds - - -@dataclass -class TokenizationStats: - """Statistics for tokenization comparison (tokens per second).""" - - lang: str - num_samples: int - total_duration: float # sum of all durations in seconds - qwen_tokens_per_second: float - nemotron_tokens_per_second: float - ipa_tokens_per_second: Dict[int, float] # vocab_size -> tokens/sec - - -def get_ipa_dir(cuts_dir: Path) -> Path: - """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" - name = cuts_dir.name - if name == "cuts": - out_name = f"cuts{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def iter_shards(ipa_dir: Path) -> List[Path]: - """Get all shard files in a directory.""" - return sorted(ipa_dir.glob(SHARD_GLOB)) - - -def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: - """ - Extract text pairs (raw text + IPA) from a single shard file. 
- - Yields: - TextPair objects with raw_text, ipa_text, and duration - """ - with gzip.open(shard_path, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - # Get duration from the top-level cut object - duration = cut.get("duration", 0.0) - supervisions = cut.get("supervisions", []) - for sup in supervisions: - custom = sup.get("custom", {}) - ipa = custom.get("ipa") - # Get raw text - prefer normalized_text, fallback to text - raw_text = custom.get("normalized_text") or sup.get("text") - - if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): - ipa = ipa.strip() - raw_text = raw_text.strip() - if ipa and raw_text and duration > 0: - yield TextPair(raw_text=raw_text, ipa_text=ipa, lang=lang, duration=duration) - except json.JSONDecodeError: - continue - - -def sample_text_pairs( - lang: str, - cuts_dirs: Dict[str, List[str]], - num_samples: int = 1000, - seed: int = 42, -) -> List[TextPair]: - """ - Sample text pairs from a language's cuts_with_ipa directories. 
- - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - num_samples: Number of samples to collect - seed: Random seed for reproducibility - - Returns: - List of TextPair objects - """ - random.seed(seed) - - if lang not in cuts_dirs: - raise ValueError(f"Unknown language: {lang}") - - # Collect all text pairs from all directories - all_pairs = [] - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) - continue - - shards = iter_shards(ipa_dir) - for shard in shards: - for pair in extract_text_pairs_from_shard(shard, lang): - all_pairs.append(pair) - # Early exit if we have way more than needed - if len(all_pairs) >= num_samples * 10: - break - if len(all_pairs) >= num_samples * 10: - break - if len(all_pairs) >= num_samples * 10: - break - - # Sample - if len(all_pairs) <= num_samples: - print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") - return all_pairs - - return random.sample(all_pairs, num_samples) - - -def iter_ipa_strings_for_lang( - lang: str, - cuts_dirs: Dict[str, List[str]], -) -> Generator[str, None, None]: - """Iterate over all IPA strings for a single language (memory-efficient).""" - if lang not in cuts_dirs: - return - - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - continue - - shards = iter_shards(ipa_dir) - for shard in shards: - with gzip.open(shard, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - for sup in cut.get("supervisions", []): - ipa = sup.get("custom", {}).get("ipa") - if ipa and isinstance(ipa, str) and ipa.strip(): - yield ipa.strip() - except json.JSONDecodeError: - continue - - -def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, 
List[str]], max_count: int = 100000) -> int: - """Count IPA strings for a language without loading into memory.""" - count = 0 - for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): - count += 1 - if count >= max_count: - break - return count - - -def simple_sample_ipa_strings( - lang: str, - cuts_dirs: Dict[str, List[str]], - k: int, - max_collect: int = 100000, - seed: int = 42, -) -> List[str]: - """ - Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - - This avoids reading through all data like reservoir sampling does. - - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - k: Number of samples to select - max_collect: Maximum number of strings to collect before sampling - seed: Random seed for reproducibility - - Returns: - List of up to k sampled IPA strings - """ - rng = random.Random(seed) - collected: List[str] = [] - - for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): - collected.append(ipa) - if len(collected) >= max_collect: - break - - # If we have fewer than k, return all - if len(collected) <= k: - return collected - - # Otherwise, randomly sample k - return rng.sample(collected, k) - - -def create_balanced_corpus( - train_langs: List[str], - cuts_dirs: Dict[str, List[str]], - output_file: str, - max_samples_per_lang: Optional[int] = None, - max_count_per_lang: int = 100000, - seed: int = 42, -) -> Tuple[str, Dict[str, int]]: - """ - Create a balanced IPA corpus file with equal samples from each language. - - Uses a memory-efficient two-pass approach: - 1. First pass: Count sentences per language (up to max_count_per_lang) - 2. 
Second pass: Use simple sampling to select samples - - Args: - train_langs: List of language codes to include - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - output_file: Path to write the balanced corpus - max_samples_per_lang: Optional cap on samples per language - max_count_per_lang: Max count per language when counting IPA strings - seed: Random seed for reproducibility - - Returns: - Tuple of (corpus_file_path, dict of lang -> actual_count) - """ - # First pass: Count sentences per language - print("[INFO] Pass 1: Counting IPA strings per language...") - lang_counts: Dict[str, int] = {} - - for lang in train_langs: - if lang not in cuts_dirs: - print(f"[WARN] Language {lang} not in config, skipping") - continue - print(f"[INFO] Counting {lang}...", end=" ", flush=True) - count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) - lang_counts[lang] = count - print(f"{count} IPA strings") - - if not lang_counts: - raise ValueError("No IPA strings found for any language") - - # Find minimum count across languages - min_count = min(lang_counts.values()) - print(f"[INFO] Minimum count across languages: {min_count}") - - # Apply max_samples_per_lang cap if specified - samples_per_lang = min_count - if max_samples_per_lang is not None and max_samples_per_lang < min_count: - samples_per_lang = max_samples_per_lang - print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - - # Second pass: Sample from each language using simple sampling - print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") - actual_counts: Dict[str, int] = {} - total_written = 0 - - with open(output_file, "w", encoding="utf-8") as f: - for lang in lang_counts.keys(): - print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) - # Use different seed per language for variety, but reproducible - lang_seed = seed + hash(lang) % 10000 - sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) - - for ipa in sampled: - f.write(ipa + "\n") - total_written += 1 - - actual_counts[lang] = len(sampled) - print(f"sampled {len(sampled)} strings") - - print(f"[INFO] Total IPA strings written to corpus: {total_written}") - print(f"[INFO] Balanced corpus saved to: {output_file}") - - return output_file, actual_counts - - -def train_ipa_bpe_tokenizer( - output_dir: str, - vocab_size: int, - corpus_file: str, - min_frequency: int = 2, -) -> Tokenizer: - """ - Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. - - Args: - output_dir: Directory to save tokenizer files - vocab_size: Target vocabulary size - corpus_file: Path to the IPA corpus file (one IPA string per line) - min_frequency: Minimum frequency for a token to be included - - Returns: - Trained Tokenizer object - """ - tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") - os.makedirs(tokenizer_dir, exist_ok=True) - - tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") - - # Check if already trained - if os.path.exists(tokenizer_file): - print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") - return Tokenizer.from_file(tokenizer_file) - - # Initialize tokenizer - tokenizer = Tokenizer(BPE(unk_token="")) - tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - - special_tokens = ["", "", ""] - - trainer = BpeTrainer( - vocab_size=vocab_size, - min_frequency=min_frequency, - special_tokens=special_tokens, - show_progress=True, - ) - - print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") - tokenizer.train(files=[corpus_file], trainer=trainer) - - # Save - tokenizer.save(tokenizer_file) - tokenizer.model.save(tokenizer_dir) - - print(f"[INFO] Saved tokenizer to {tokenizer_dir}") - - return tokenizer - - -def compute_stats( - text_pairs: List[TextPair], - qwen_tokenizer: AutoTokenizer, - nemotron_tokenizer: AutoTokenizer, - ipa_tokenizers: Dict[int, Tokenizer], - lang: str, -) -> 
TokenizationStats: - """ - Compute tokenization statistics (tokens per second) for a set of text pairs. - """ - qwen_counts = [] - nemotron_counts = [] - ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} - - for pair in text_pairs: - # Qwen tokenizer on raw text - qwen_tokens = qwen_tokenizer.encode(pair.raw_text) - qwen_counts.append(len(qwen_tokens)) - - # Nemotron tokenizer on raw text - nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) - nemotron_counts.append(len(nemotron_tokens)) - - # IPA tokenizers on IPA text - for vocab_size, tokenizer in ipa_tokenizers.items(): - ipa_tokens = tokenizer.encode(pair.ipa_text) - ipa_counts[vocab_size].append(len(ipa_tokens.ids)) - - # Calculate total duration and token counts - total_duration = sum(pair.duration for pair in text_pairs) - qwen_total = sum(qwen_counts) - nemotron_total = sum(nemotron_counts) - - # Compute tokens per second - qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 - nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 - - ipa_tps = {} - for vocab_size in ipa_tokenizers.keys(): - ipa_total = sum(ipa_counts[vocab_size]) - ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 - - return TokenizationStats( - lang=lang, - num_samples=len(text_pairs), - total_duration=total_duration, - qwen_tokens_per_second=qwen_tps, - nemotron_tokens_per_second=nemotron_tps, - ipa_tokens_per_second=ipa_tps, - ) - - -def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int]): - """Print a formatted table of tokens per second statistics.""" - print("\n" + "=" * 120) - print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") - print("=" * 120) - - # Header - header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" - for vs in vocab_sizes: - header += f" {'IPA-' + str(vs):>10}" - print(header) - print("-" * 120) - - # Data rows - 
for stats in all_stats: - row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" - for vs in vocab_sizes: - row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" - print(row) - - # Aggregated stats - print("-" * 120) - total_samples = sum(s.num_samples for s in all_stats) - total_duration = sum(s.total_duration for s in all_stats) - - # Compute overall tokens per second (weighted by duration) - total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) - total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) - overall_qwen_tps = total_qwen_tokens / total_duration if total_duration > 0 else 0 - overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 - - agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" - for vs in vocab_sizes: - total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) - overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 - agg_row += f" {overall_ipa_tps:>10.2f}" - print(agg_row) - print("=" * 120) - - # Summary - print("\nSUMMARY:") - print(f" - Total samples analyzed: {total_samples}") - print(f" - Total audio duration: {total_duration:.2f} seconds ({total_duration/3600:.2f} hours)") - print(f" - Qwen tokens/second: {overall_qwen_tps:.2f}") - print(f" - Nemotron tokens/second: {overall_nemotron_tps:.2f}") - for vs in vocab_sizes: - total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) - overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 - print(f" - IPA-{vs} tokens/second: {overall_ipa_tps:.2f}") - print() - - -def save_results_json( - all_stats: List[TokenizationStats], - output_path: str, - train_langs: Optional[List[str]] = None, - test_langs: 
Optional[List[str]] = None, -): - """Save results to JSON file with metadata.""" - output = { - "metadata": { - "train_langs": train_langs or [], - "test_langs": test_langs or [], - }, - "results": [], - } - - for stats in all_stats: - output["results"].append( - { - "lang": stats.lang, - "num_samples": stats.num_samples, - "total_duration_seconds": stats.total_duration, - "qwen_tokens_per_second": stats.qwen_tokens_per_second, - "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, - "ipa_tokens_per_second": { - str(vs): stats.ipa_tokens_per_second[vs] for vs in stats.ipa_tokens_per_second.keys() - }, - } - ) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(output, f, indent=2) - print(f"[INFO] Saved results to {output_path}") - - -def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: - """Parse a language argument (comma-separated or 'all').""" - if arg == "all": - return available_langs - langs = [l.strip() for l in arg.split(",") if l.strip()] - # Validate languages - for lang in langs: - if lang not in available_langs: - raise ValueError(f"Unknown language: {lang}. 
Available: {available_langs}") - return langs - - -def main(): - parser = argparse.ArgumentParser(description="Compare tokenization between Qwen and IPA BPE tokenizers.") - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Directory to save tokenizers and results", - ) - parser.add_argument( - "--samples_per_lang", - type=int, - default=1000, - help="Number of samples per language for testing (default: 1000)", - ) - parser.add_argument( - "--train_langs", - type=str, - default="all", - help="Comma-separated languages for training tokenizer, or 'all' (default: all)", - ) - parser.add_argument( - "--test_langs", - type=str, - default="all", - help="Comma-separated languages for testing/analysis, or 'all' (default: all)", - ) - parser.add_argument( - "--max_samples_per_lang", - type=int, - default=None, - help="Optional cap on training samples per language (default: use min count across langs)", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for sampling (default: 42)", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", - ) - parser.add_argument( - "--max_count_per_lang", - type=int, - default=100000, - help="Max count per language when counting IPA strings (default: 100000)", - ) - args = parser.parse_args() - - os.makedirs(args.output_dir, exist_ok=True) - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - available_langs = list(cuts_dirs.keys()) - print(f"[INFO] Loaded config with languages: {available_langs}") - - # Parse train and test languages - try: - train_langs = parse_lang_arg(args.train_langs, available_langs) - test_langs = parse_lang_arg(args.test_langs, available_langs) - except ValueError as e: - print(f"[ERROR] {e}") - sys.exit(1) - - print(f"[INFO] Training languages: {train_langs}") - print(f"[INFO] Testing languages: {test_langs}") - print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") - print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") - print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") - - # Step 1: Create balanced IPA corpus once - print("\n" + "=" * 60) - print("STEP 1: Creating balanced IPA corpus") - print("=" * 60) - - corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - - # Check if corpus already exists - if os.path.exists(corpus_file): - print(f"[INFO] Using existing corpus file: {corpus_file}") - with open(corpus_file, "r", encoding="utf-8") as f: - line_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {line_count} IPA strings") - else: - corpus_file, lang_counts = create_balanced_corpus( - train_langs=train_langs, - cuts_dirs=cuts_dirs, - output_file=corpus_file, - max_samples_per_lang=args.max_samples_per_lang, - max_count_per_lang=args.max_count_per_lang, - seed=args.seed, - ) - - # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) - print("\n" + "=" * 60) - print("STEP 2: Training IPA BPE tokenizers") - 
print("=" * 60) - - ipa_tokenizers = {} - for vocab_size in VOCAB_SIZES: - print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") - ipa_tokenizers[vocab_size] = train_ipa_bpe_tokenizer( - output_dir=args.output_dir, - vocab_size=vocab_size, - corpus_file=corpus_file, - min_frequency=2, - ) - - # Step 3: Load Qwen and Nemotron tokenizers - print("\n" + "=" * 60) - print("STEP 3: Loading Qwen and Nemotron tokenizers") - print("=" * 60) - - print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") - qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") - print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") - - print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") - - nemotron_tokenizer = AutoTokenizer.from_pretrained( - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True - ) - - print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") - - # Step 4: Sample text pairs and compute statistics (on test languages) - print("\n" + "=" * 60) - print("STEP 4: Sampling and analyzing (test languages)") - print("=" * 60) - - all_stats = [] - for lang in test_langs: - print(f"\n[INFO] Processing language: {lang}") - - # Sample text pairs - text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) - - if not text_pairs: - print(f"[WARN] No text pairs found for {lang}, skipping") - continue - - print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") - - # Compute stats - stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) - all_stats.append(stats) - - # Print intermediate results - print( - f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s" - ) - for vs in VOCAB_SIZES: - print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") - - # Step 5: Print and save results - print("\n" + 
"=" * 60) - print("STEP 5: Results") - print("=" * 60) - - print_stats_table(all_stats, VOCAB_SIZES) - - # Save to JSON with metadata - results_path = os.path.join(args.output_dir, "tokenization_comparison.json") - save_results_json(all_stats, results_path, train_langs, test_langs) - - print("[INFO] Done!") - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json deleted file mode 100644 index 8785de53211e..000000000000 --- a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "de": ["/Data/tts_lhotse_datasets/speech_data/de/cmltts_de_train/cuts"], - "es": [ - "/Data/tts_lhotse_datasets/speech_data/es/cmltts_es_train/cuts", - "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts", - "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts_textContext" - ], - "fr": [ - "/Data/tts_lhotse_datasets/speech_data/fr/cmltts_fr_train/cuts", - "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts", - "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts_textContext" - ], - "hi": [ - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_2/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_2/cuts" - ], - "it": ["/Data/tts_lhotse_datasets/speech_data/it/cmltts_it_train/cuts"], - "vi": [ - "/Data/tts_lhotse_datasets/speech_data/vi/Infore1_2_lsvsc/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/vi/NorthFemale/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/nvyt_vi/nvyt_yt2025/cuts" - ], - "zh": [ - "/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts", - 
"/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_2/cuts" - ], - "en": [ - "/Data/tts_lhotse_datasets/speech_data/en/nvyt2505/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/hifitts/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/hifitts2/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/libritts/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts_textContext" - ] -} diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py deleted file mode 100644 index 825129d2c928..000000000000 --- a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py +++ /dev/null @@ -1,522 +0,0 @@ -#!/usr/bin/env python3 -""" -Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards. - -This script: -1. Reads IPA strings from cuts_with_ipa directories (output of add_ipa_to_lhotse_shards.py) -2. Optionally balances data across languages (samples equal amounts from each) -3. Trains a HuggingFace ByteLevelBPETokenizer on all extracted IPA strings -4. 
Saves vocab.json and merges.txt to the specified output directory - -Features: -- Language balancing: uses the same number of samples from each language -- Configurable max samples per language - -Usage: - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --vocab_size 1024 - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs en,de --vocab_size 2048 - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs all --max_samples_per_lang 50000 - -The trained tokenizer can be loaded using the IPABPETokenizer class in: - nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py -""" - -from __future__ import annotations - -import argparse -import gzip -import json -import os -import random -import sys -from pathlib import Path -from typing import Dict, Generator, List, Optional, Tuple - -from tokenizers import Tokenizer -from tokenizers.decoders import ByteLevel as ByteLevelDecoder -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.trainers import BpeTrainer - -# ------------------------- -# USER CONFIG - Same structure as add_ipa_to_lhotse_shards.py -# ------------------------- - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa -SHARD_GLOB = "cuts.*.jsonl.gz" - - -def get_ipa_dir(cuts_dir: Path) -> Path: - """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" - name = cuts_dir.name - if name == "cuts": - out_name = 
f"cuts{OUTPUT_SUFFIX}" - elif name.endswith("_textContext"): - # Handle cuts_textContext -> cuts_textContext_with_ipa - out_name = f"{name}{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def iter_shards(ipa_dir: Path) -> List[Path]: - """Get all shard files in a directory.""" - return sorted(ipa_dir.glob(SHARD_GLOB)) - - -def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: - """ - Extract all IPA strings from a single shard file. - - Yields: - IPA strings from cut["supervisions"][i]["custom"]["ipa"] - """ - with gzip.open(shard_path, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - supervisions = cut.get("supervisions", []) - for sup in supervisions: - custom = sup.get("custom", {}) - ipa = custom.get("ipa") - if ipa and isinstance(ipa, str) and ipa.strip(): - yield ipa.strip() - except json.JSONDecodeError: - continue - - -def extract_ipa_from_dir(ipa_dir: Path) -> Generator[str, None, None]: - """Extract all IPA strings from all shards in a directory.""" - shards = iter_shards(ipa_dir) - for shard in shards: - yield from extract_ipa_from_shard(shard) - - -def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: - """Return list of all available language codes.""" - return list(cuts_dirs.keys()) - - -def collect_ipa_strings( - cuts_dirs: Dict[str, List[str]], - lang: Optional[str] = None, -) -> Generator[str, None, None]: - """ - Collect all IPA strings from the specified language(s). - - Args: - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - lang: Language code or None for all languages. - - Yields: - IPA strings - """ - if lang is None or lang == "all": - langs_to_process = list(cuts_dirs.keys()) - else: - if lang not in cuts_dirs: - raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") - langs_to_process = [lang] - - for lang_code in langs_to_process: - print(f"[INFO] Processing language: {lang_code}") - for cuts_dir_str in cuts_dirs[lang_code]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) - continue - - print(f"[INFO] Reading from: {ipa_dir}") - count = 0 - for ipa in extract_ipa_from_dir(ipa_dir): - yield ipa - count += 1 - print(f"[INFO] Extracted {count} IPA strings from {ipa_dir}") - - -def iter_ipa_strings_for_lang( - lang: str, - cuts_dirs: Dict[str, List[str]], -) -> Generator[str, None, None]: - """Iterate over all IPA strings for a single language (memory-efficient).""" - if lang not in cuts_dirs: - return - - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - continue - - for ipa in extract_ipa_from_dir(ipa_dir): - yield ipa - - -def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, List[str]], max_count: int = 100000) -> int: - """Count IPA strings for a language without loading into memory.""" - count = 0 - for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): - count += 1 - if count >= max_count: - break - return count - - -def simple_sample_ipa_strings( - lang: str, - cuts_dirs: Dict[str, List[str]], - k: int, - max_collect: int = 100000, - seed: int = 42, -) -> List[str]: - """ - Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - - This avoids reading through all data like reservoir sampling does. 
- - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - k: Number of samples to select - max_collect: Maximum number of strings to collect before sampling - seed: Random seed for reproducibility - - Returns: - List of up to k sampled IPA strings - """ - rng = random.Random(seed) - collected: List[str] = [] - - for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): - collected.append(ipa) - if len(collected) >= max_collect: - break - - # If we have fewer than k, return all - if len(collected) <= k: - return collected - - # Otherwise, randomly sample k - return rng.sample(collected, k) - - -def parse_langs_arg(arg: str, available_langs: List[str]) -> List[str]: - """Parse a language argument (comma-separated or 'all').""" - if arg == "all": - return available_langs - langs = [l.strip() for l in arg.split(",") if l.strip()] - for lang in langs: - if lang not in available_langs: - raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") - return langs - - -def create_balanced_corpus( - train_langs: List[str], - cuts_dirs: Dict[str, List[str]], - output_file: str, - max_samples_per_lang: Optional[int] = None, - max_count_per_lang: int = 100000, - seed: int = 42, -) -> Tuple[str, Dict[str, int]]: - """ - Create a balanced IPA corpus file with equal samples from each language. - - Uses a memory-efficient two-pass approach: - 1. First pass: Count sentences per language (up to max_count_per_lang) - 2. 
Second pass: Use simple sampling to select samples - - Args: - train_langs: List of language codes to include - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - output_file: Path to write the balanced corpus - max_samples_per_lang: Optional cap on samples per language - max_count_per_lang: Max count per language when counting IPA strings - seed: Random seed for reproducibility - - Returns: - Tuple of (corpus_file_path, dict of lang -> actual_count) - """ - # First pass: Count sentences per language - print("[INFO] Pass 1: Counting IPA strings per language...") - lang_counts: Dict[str, int] = {} - - for lang in train_langs: - if lang not in cuts_dirs: - print(f"[WARN] Language {lang} not in config, skipping") - continue - print(f"[INFO] Counting {lang}...", end=" ", flush=True) - count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) - lang_counts[lang] = count - print(f"{count} IPA strings") - - if not lang_counts: - raise ValueError("No IPA strings found for any language") - - # Find minimum count across languages - min_count = min(lang_counts.values()) - print(f"[INFO] Minimum count across languages: {min_count}") - - # Apply max_samples_per_lang cap if specified - samples_per_lang = min_count - if max_samples_per_lang is not None and max_samples_per_lang < min_count: - samples_per_lang = max_samples_per_lang - print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - - # Second pass: Sample from each language using simple sampling - print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") - actual_counts: Dict[str, int] = {} - total_written = 0 - - with open(output_file, "w", encoding="utf-8") as f: - for lang in lang_counts.keys(): - print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) - # Use different seed per language for variety, but reproducible - lang_seed = seed + hash(lang) % 10000 - sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) - - for ipa in sampled: - f.write(ipa + "\n") - total_written += 1 - - actual_counts[lang] = len(sampled) - print(f"sampled {len(sampled)} strings") - - print(f"[INFO] Total IPA strings written to corpus: {total_written}") - print(f"[INFO] Balanced corpus saved to: {output_file}") - - return output_file, actual_counts - - -def train_bpe_tokenizer( - corpus_file: str, - vocab_size: int = 1024, - min_frequency: int = 2, - output_dir: str = "./ipa_bpe_tokenizer", -) -> Tokenizer: - """ - Train a byte-level BPE tokenizer on IPA strings from a corpus file. - - Args: - corpus_file: Path to the IPA corpus file (one IPA string per line) - vocab_size: Target vocabulary size - min_frequency: Minimum frequency for a token to be included - output_dir: Directory to save the tokenizer files - - Returns: - Trained Tokenizer object - """ - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Check if tokenizer already exists - tokenizer_path = os.path.join(output_dir, "tokenizer.json") - if os.path.exists(tokenizer_path): - print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") - return Tokenizer.from_file(tokenizer_path) - - # Count lines in corpus - with open(corpus_file, "r", encoding="utf-8") as f: - total_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {total_count} IPA strings") - - if total_count == 0: - raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") - - # Initialize a byte-level BPE tokenizer - tokenizer = Tokenizer(BPE(unk_token="")) - - # Use byte-level pre-tokenization (like GPT-2) - tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - - # Add byte-level decoder to properly convert back to original text - tokenizer.decoder = ByteLevelDecoder() - - # Define special tokens - special_tokens = ["", "", ""] - - # Create trainer - trainer = BpeTrainer( - vocab_size=vocab_size, - min_frequency=min_frequency, - special_tokens=special_tokens, - show_progress=True, - ) - - # Train the tokenizer - print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") - tokenizer.train(files=[corpus_file], trainer=trainer) - - # Save the tokenizer - vocab_path = os.path.join(output_dir, "vocab.json") - merges_path = os.path.join(output_dir, "merges.txt") - - # Save using the tokenizer's model save method - tokenizer.model.save(output_dir) - - # Also save the full tokenizer for easy loading - tokenizer.save(tokenizer_path) - - print(f"[INFO] Tokenizer saved to: {output_dir}") - print(f"[INFO] - vocab.json: {vocab_path}") - print(f"[INFO] - merges.txt: {merges_path}") - print(f"[INFO] - tokenizer.json: {tokenizer_path}") - print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") - - return tokenizer - - -def main(): - parser = argparse.ArgumentParser( - description="Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards." 
- ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Directory to save the trained tokenizer files (vocab.json, merges.txt, tokenizer.json)", - ) - parser.add_argument( - "--vocab_size", - type=int, - default=1024, - help="Vocabulary size for the BPE tokenizer (default: 1024)", - ) - parser.add_argument( - "--min_frequency", - type=int, - default=2, - help="Minimum frequency for a token to be included in vocabulary (default: 2)", - ) - parser.add_argument( - "--train_langs", - type=str, - default="all", - help="Comma-separated language codes for training, or 'all' (default: all)", - ) - parser.add_argument( - "--max_samples_per_lang", - type=int, - default=None, - help="Optional cap on samples per language (default: use min count across langs for balance)", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for sampling (default: 42)", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", - ) - parser.add_argument( - "--max_count_per_lang", - type=int, - default=100000, - help="Max count per language when counting IPA strings (default: 100000)", - ) - args = parser.parse_args() - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - available_langs = get_available_languages(cuts_dirs) - - # Parse train_langs - try: - train_langs = parse_langs_arg(args.train_langs, available_langs) - except ValueError as e: - print(f"[ERROR] {e}") - sys.exit(1) - - print(f"[INFO] Training IPA BPE tokenizer") - print(f"[INFO] Output directory: {args.output_dir}") - print(f"[INFO] Vocabulary size: {args.vocab_size}") - print(f"[INFO] Min frequency: {args.min_frequency}") - print(f"[INFO] Training languages: {train_langs}") - print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") - print(f"[INFO] Max count per lang: {args.max_count_per_lang}") - print(f"[INFO] Available languages: {available_langs}") - - os.makedirs(args.output_dir, exist_ok=True) - - # Step 1: Create balanced corpus - print("\n" + "=" * 60) - print("STEP 1: Creating balanced IPA corpus") - print("=" * 60) - - corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - - if os.path.exists(corpus_file): - print(f"[INFO] Using existing corpus file: {corpus_file}") - with open(corpus_file, "r", encoding="utf-8") as f: - line_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {line_count} IPA strings") - else: - corpus_file, lang_counts = create_balanced_corpus( - train_langs=train_langs, - cuts_dirs=cuts_dirs, - output_file=corpus_file, - max_samples_per_lang=args.max_samples_per_lang, - max_count_per_lang=args.max_count_per_lang, - seed=args.seed, - ) - - # Step 2: Train tokenizer - print("\n" + "=" * 60) - print("STEP 2: Training BPE tokenizer") - print("=" * 60) - - tokenizer = train_bpe_tokenizer( - corpus_file=corpus_file, - 
vocab_size=args.vocab_size, - min_frequency=args.min_frequency, - output_dir=args.output_dir, - ) - - # Test the tokenizer - print("\n[INFO] Testing tokenizer with sample IPA strings:") - test_strings = [ - "həˈloʊ wɜːld", # hello world - "ˈaɪ pʰiː eɪ", # IPA - "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three - ] - for test_str in test_strings: - encoded = tokenizer.encode(test_str) - decoded = tokenizer.decode(encoded.ids) - print(f" Input: '{test_str}'") - print(f" Tokens: {encoded.tokens}") - print(f" IDs: {encoded.ids}") - print(f" Decoded: '{decoded}'") - print() - - print("[INFO] Done!") - - -if __name__ == "__main__": - main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json deleted file mode 100644 index 6d7e35116405..000000000000 --- a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json +++ /dev/null @@ -1,9954 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 0, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 1, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - } - ], - "normalizer": null, - "pre_tokenizer": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": true - }, - "post_processor": null, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": "", - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": false, - "vocab": 
{ - "": 0, - "": 1, - "": 2, - "(": 3, - ")": 4, - "-": 5, - ".": 6, - "1": 7, - "2": 8, - "4": 9, - "5": 10, - "6": 11, - "7": 12, - "F": 13, - "a": 14, - "b": 15, - "c": 16, - "d": 17, - "e": 18, - "f": 19, - "h": 20, - "i": 21, - "j": 22, - "k": 23, - "l": 24, - "m": 25, - "n": 26, - "o": 27, - "p": 28, - "q": 29, - "r": 30, - "s": 31, - "t": 32, - "u": 33, - "v": 34, - "w": 35, - "x": 36, - "y": 37, - "z": 38, - "¡": 39, - "£": 40, - "¦": 41, - "§": 42, - "©": 43, - "ª": 44, - "¬": 45, - "°": 46, - "²": 47, - "³": 48, - "¸": 49, - "¹": 50, - "¾": 51, - "Ã": 52, - "Å": 53, - "É": 54, - "Ê": 55, - "Ë": 56, - "Ì": 57, - "Î": 58, - "Ï": 59, - "Ċ": 60, - "Ġ": 61, - "Ģ": 62, - "ģ": 63, - "Ĥ": 64, - "ĥ": 65, - "ĩ": 66, - "Ī": 67, - "Ĭ": 68, - "ĭ": 69, - "Į": 70, - "į": 71, - "İ": 72, - "ı": 73, - "IJ": 74, - "ij": 75, - "Ĵ": 76, - "ĵ": 77, - "Ķ": 78, - "ķ": 79, - "ĸ": 80, - "Ĺ": 81, - "Ļ": 82, - "Ľ": 83, - "ľ": 84, - "Ŀ": 85, - "Ł": 86, - "ËĪ": 87, - "ËIJ": 88, - "ËĪÉ": 89, - "ËĮ": 90, - "ÉĻ": 91, - "ËĪa": 92, - "ËĪi": 93, - "Ġt": 94, - "ɪ": 95, - "ɾ": 96, - "ĠÉ": 97, - "Ġk": 98, - "Éľ": 99, - "Ġs": 100, - "ËĪe": 101, - "ÉĽ": 102, - "ËĪo": 103, - "Ġl": 104, - "ËĪÉĽ": 105, - "Ġd": 106, - "ÊĬ": 107, - "ËĪaËIJ": 108, - "Ġp": 109, - "Ìĥ": 110, - "Ġm": 111, - "ËĪu": 112, - "Åĭ": 113, - "ð": 114, - "ËĪÉĶ": 115, - "ÊĮ": 116, - "ËĮa": 117, - "Ġh": 118, - "ËĪÊĮ": 119, - "Ġn": 120, - "Êģ": 121, - "ËĪÉij": 122, - "Êĥ": 123, - "eËIJ": 124, - "Ġa": 125, - "Ġb": 126, - "ÉĶ": 127, - "ËĪÉĻ": 128, - "ÉĻn": 129, - "Ġf": 130, - "ËĪɪ": 131, - "É¡": 132, - "ËĪeËIJ": 133, - "Ġj": 134, - "nt": 135, - "Ġð": 136, - "ĠËĮ": 137, - "Ġts": 138, - "ĠÉ¡": 139, - "Éķ": 140, - "ËĪoËIJ": 141, - "ʰ": 142, - "aËIJ": 143, - "ËĪy": 144, - "ĠtÉķ": 145, - "ËĪiËIJ": 146, - "ĠÊ": 147, - "Ġv": 148, - "Ġw": 149, - "st": 150, - "Éij": 151, - "nd": 152, - "ËĮi": 153, - "̪": 154, - "ËĮe": 155, - "Ġz": 156, - "ËĪaɪ": 157, - "ËĪiÉĽ": 158, - "β": 159, - "ɹ": 160, - "ĠËĮa": 161, - "θ": 162, - "ĠhÉĽ": 163, - "ÊĪ": 164, 
- "iËIJ": 165, - "ËĮo": 166, - "Ġɪ": 167, - "Éľn": 168, - "Ġx": 169, - "ĠtÉĻ": 170, - "ËĪuËIJ": 171, - "ËĮÉĻ": 172, - "ĠjËĪi": 173, - "ËĮÉĽ": 174, - "ĠÉĽ": 175, - "ĠËĪa": 176, - "ËĮaËIJ": 177, - "Ġla": 178, - "Ġðe": 179, - "ĠhÉĽËIJ": 180, - "Ġe": 181, - "ç": 182, - "ÉĻl": 183, - "oËIJ": 184, - "ËĪÉiju": 185, - "ÊĴ": 186, - "uËIJ": 187, - "ĠÉĹ": 188, - "ĠÉķ": 189, - "ËĮeËIJ": 190, - "ĠtÉķËĪi": 191, - "os": 192, - "ËĪÉĶËIJ": 193, - "as": 194, - "ËĪÊĬ": 195, - "Ġi": 196, - "ËĪai": 197, - "ɲ": 198, - "ɪn": 199, - "ts": 200, - "ÉľÅĭ": 201, - "ĠÉŁ": 202, - "ĠÊĥ": 203, - "ËĪeɪ": 204, - "ÉĽÉ¾": 205, - "ËĪÉĽËIJ": 206, - "ËĪÉĽÉ¾": 207, - "Ġr": 208, - "tÊĥ": 209, - "ËĮÉĶ": 210, - "ĠdÉĻ": 211, - "tÉĻ": 212, - "ou": 213, - "ËĪyÉĻ": 214, - "ĠËĮi": 215, - "ÉĻɾ": 216, - "ËĪÉĻÊĬ": 217, - "ËĪÊĮɾ": 218, - "ËĪÉĴ": 219, - "Ġth": 220, - "ËĪon": 221, - "Êĭ": 222, - "ËĪÉijËIJ": 223, - "ËĪÊĮh": 224, - "wËĪa": 225, - "ËĪei": 226, - "ll": 227, - "ĠÉIJ": 228, - "ÉijËIJ": 229, - "an": 230, - "ÉŁ": 231, - "ĠÊĭ": 232, - "Ġko": 233, - "kh": 234, - "ɪÅĭ": 235, - "ËĪaËIJɪ": 236, - "ĠtÊĥ": 237, - "ËĪaËIJt": 238, - "ĠËĮe": 239, - "ĠtÉķh": 240, - "ËĪuo": 241, - "ËĪonÉ¡": 242, - "Éĸ": 243, - "at": 244, - "Ġke": 245, - "ÉĴ": 246, - "ĠÉķËĪi": 247, - "ø": 248, - "ĠÉij": 249, - "ËĪeËIJk": 250, - "Åĵ": 251, - "re": 252, - "Ġɾ": 253, - "ĠkÉĶ": 254, - "ËĮÊĬ": 255, - "sk": 256, - "ĠÊĬ": 257, - "Ġand": 258, - "ɪç": 259, - "Ġme": 260, - "ËĪaɾ": 261, - "ĠËĪɪ": 262, - "na": 263, - "Ġβ": 264, - "ĠlËĪi": 265, - "jaËIJ": 266, - "li": 267, - "no": 268, - "Ġɪn": 269, - "ĠdËĮi": 270, - "Ġɲ": 271, - "tËIJ": 272, - "ÉĻm": 273, - "ĠlÉĻ": 274, - "ĠðÉĻ": 275, - "ɪk": 276, - "ËĪÉĽl": 277, - "Éľt": 278, - "Ġse": 279, - "es": 280, - "ËĪou": 281, - "ËĪaÊĬ": 282, - "ĠÉĶ": 283, - "ɪt": 284, - "ĠÅĭ": 285, - "ËĪÉĽn": 286, - "Êİ": 287, - "Ġkh": 288, - "ËĪÉĽnt": 289, - "ËĪaËIJɾ": 290, - "Ġki": 291, - "mp": 292, - "lt": 293, - "É£": 294, - "Ġpa": 295, - "ËĪÉĻËIJ": 296, - "ɪs": 297, - "ĠÉĴ": 298, - "Ġle": 299, - "ÉªÉľ": 300, - "ËĪÉĽt": 
301, - "Ġde": 302, - "Ġɹ": 303, - "ĠtËĪoËIJ": 304, - "ĠÊģ": 305, - "ÊĥÉĻn": 306, - "ĠÊĬnt": 307, - "ËĪÉĶɾ": 308, - "ËĪað": 309, - "Ġaɪ": 310, - "ĠÊIJ": 311, - "ĠmËĪa": 312, - "ra": 313, - "ĠkËĪɪ": 314, - "kt": 315, - "ËIJp": 316, - "ĠÊĪ": 317, - "ËĪaËIJÊĬ": 318, - "ĠkËĪÊĮɾ": 319, - "ĠËĪÊĮ": 320, - "ĠÉĴv": 321, - "Ġel": 322, - "ks": 323, - "Ġkw": 324, - "ÉĻt": 325, - "ndo": 326, - "ei": 327, - "ĠËĮaËIJp": 328, - "se": 329, - "ÉĻɹ": 330, - "ËĪuei": 331, - "ÉĻs": 332, - "ĠkËĮo": 333, - "ĠÊĤ": 334, - "ĠËĮÊĬ": 335, - "Ġc": 336, - "ĠÉĽn": 337, - "ËĪant": 338, - "θj": 339, - "ËĮoËIJ": 340, - "ĠËĪaËIJ": 341, - "Ġpɾ": 342, - "si": 343, - "ĠËĪe": 344, - "ĠjuËIJ": 345, - "ĠkËĮe": 346, - "ËĮɪ": 347, - "ÉĶn": 348, - "ĠsËĪÊĮ": 349, - "ĠËĪu": 350, - "ni": 351, - "Ġst": 352, - "ĠdiËIJ": 353, - "ĠkeËIJ": 354, - "ĠjËĪiou": 355, - "ËĪaiÉľ": 356, - "ĠdÊĴ": 357, - "ĠËĪÉĶ": 358, - "va": 359, - "ËIJɾ": 360, - "ËĪø": 361, - "ËĮÉĻÊĬ": 362, - "ĠpËĪu": 363, - "Ġsu": 364, - "Ġma": 365, - "ĠÉĻ": 366, - "dÊĴ": 367, - "Ġpʰ": 368, - "le": 369, - "in": 370, - "ĠtÉķhËĪi": 371, - "ĠwËĪo": 372, - "ro": 373, - "ËĮy": 374, - "ɾa": 375, - "ĠsËĪi": 376, - "ðÉĻ": 377, - "ĠseËIJ": 378, - "la": 379, - "ĠÊĴ": 380, - "mb": 381, - "ĠhËĪoËIJ": 382, - "Ġbʰ": 383, - "ĠÉĽÉ¾": 384, - "Ġðat": 385, - "sp": 386, - "ÉĶɾ": 387, - "en": 388, - "ĠsÉĻ": 389, - "ËĪÉĶÉľ": 390, - "ĠlËĮa": 391, - "ĠËĮÉĽ": 392, - "ĠËĪy": 393, - "É¡aËIJ": 394, - "ĠdÉĽÉ¾": 395, - "ËĪÉĽÊģ": 396, - "Éľkh": 397, - "ËĪiÉĻ": 398, - "ËĪan": 399, - "ĠmËĪo": 400, - "ËĪaβ": 401, - "Ġal": 402, - "ĠËĪeËIJ": 403, - "Ġθ": 404, - "ĠnËĪi": 405, - "pʰ": 406, - "lla": 407, - "Ġpl": 408, - "ËĪÅĵ": 409, - "jËĪÉiju": 410, - "Ġav": 411, - "ĠmËĪi": 412, - "ĠfËĪa": 413, - "ËĪÉľ": 414, - "me": 415, - "ËĮÉĻh": 416, - "ËĪuÉĻ": 417, - "it": 418, - "jËĪe": 419, - "Ġo": 420, - "ËĪÉľËIJ": 421, - "ĠtÉķËĪiou": 422, - "ÉĶËIJ": 423, - "ĠnÉĻ": 424, - "ËĪÉĻÉľn": 425, - "ĠmÉĻ": 426, - "ĠdeËIJ": 427, - "mo": 428, - "sa": 429, - "jËĪÉĶ": 430, - "ËĪal": 431, - "ĠtÉķËĪiÉĽ": 432, - 
"ĠÉ¡ÉĻ": 433, - "ða": 434, - "Ġɪz": 435, - "Ġsa": 436, - "ri": 437, - "ĠËĮil": 438, - "ËĮu": 439, - "ĠkaËIJ": 440, - "ĠÉĻËIJ": 441, - "ĠÉĸ": 442, - "Ġka": 443, - "ËĪÊĮhi": 444, - "ĠjeËIJ": 445, - "Ġtʰ": 446, - "ne": 447, - "kËIJ": 448, - "ĠtsËĪai": 449, - "ĠËĪeËIJk": 450, - "nk": 451, - "ti": 452, - "ËĪaÉľn": 453, - "ĠkËIJ": 454, - "É¡ÉĻn": 455, - "ËĪia": 456, - "ĠÉĶËIJɾ": 457, - "Êı": 458, - "ĠËĮÊĮ": 459, - "ĠzËĪaËIJ": 460, - "Ġlos": 461, - "ÉĽs": 462, - "ËĪÉĶn": 463, - "ÉĽnt": 464, - "ÉĽn": 465, - "ĠÉŁËĪoËIJ": 466, - "çt": 467, - "Ġdas": 468, - "ĠxËĮo": 469, - "ËĪuÉľ": 470, - "ËĪas": 471, - "ĠbËĪÊĮ": 472, - "ËĪiÉĽÉľn": 473, - "ÉIJ": 474, - "ĠtsuËIJ": 475, - "ĠpËĮÉĽ": 476, - "ĠnËĪÉĶ": 477, - "ÊĬt": 478, - "ma": 479, - "ĠnËĪo": 480, - "ĠlËĪɪ": 481, - "ËĪÉĽs": 482, - "ɪl": 483, - "ĠÉķËĪiÉĽ": 484, - "ĠËĪÊĬ": 485, - "ÉĴt": 486, - "to": 487, - "ĠËĪo": 488, - "ËĮon": 489, - "ĠkwËĪa": 490, - "Ġɪt": 491, - "ĠhoËIJ": 492, - "ËĪiËIJk": 493, - "ĠËĮaËIJpk": 494, - "ËĪaɪn": 495, - "æ": 496, - "ÉĻnt": 497, - "ta": 498, - "lo": 499, - "ĠnËĪÉij": 500, - "ĠlËĪa": 501, - "ËĪiÉľ": 502, - "ĠwËĪei": 503, - "ÉĽÊģ": 504, - "ĠtËĪa": 505, - "ĠɾËĮÉĻh": 506, - "ĠÉķËĪiÉij": 507, - "ËĮiËIJ": 508, - "ËĮÉĽl": 509, - "ĠtÉĻÉľ": 510, - "ĠkËĪuo": 511, - "ĠtËĪu": 512, - "jËĪÉĽ": 513, - "ĠËĮin": 514, - "ɾe": 515, - "ĠkoËIJ": 516, - "ĠkËĪa": 517, - "ɾi": 518, - "ĠtÉķËĪiÉij": 519, - "lÉĻ": 520, - "ĠkÉĻ": 521, - "ĠtËĪi": 522, - "ĠÅĭËĪyÉĻ": 523, - "Ġtsh": 524, - "er": 525, - "av": 526, - "ĠkÉĶn": 527, - "ËĪÉĻÉľÅĭ": 528, - "ðo": 529, - "ËĪaËIJn": 530, - "ĠbʰËĪi": 531, - "ĠkËIJjaËIJ": 532, - "ÉĻz": 533, - "ĠpÊģ": 534, - "ĠdËĪɪ": 535, - "ĠziËIJ": 536, - "É¡eËIJ": 537, - "ĠtËĪÉĻ": 538, - "ɪz": 539, - "ĠnËĮon": 540, - "taËIJ": 541, - "bl": 542, - "te": 543, - "nËĮeËIJ": 544, - "ËĪɪl": 545, - "so": 546, - "ko": 547, - "uÊģ": 548, - "ĠÉ£": 549, - "ĠpaÊģ": 550, - "ĠËĪÉĽ": 551, - "jËĪuËIJ": 552, - "ËĮÊĮ": 553, - "yn": 554, - "ËĪiËIJn": 555, - "ĠlËĪaɪ": 556, - "ËĪɪÅĭ": 557, - "ĠtÉķhËĪy": 558, - "ĠnËĪÊĮhi": 559, - 
"ĠdËĮe": 560, - "ĠjËĪÉiju": 561, - "ĠtËĪÉiju": 562, - "ĠhËĪo": 563, - "ɪd": 564, - "ĠthËĪÉij": 565, - "mËĪe": 566, - "ĠËĪÉĻ": 567, - "ja": 568, - "Ġph": 569, - "ÉĽt": 570, - "ĠkËĪÊĮ": 571, - "tÉĻn": 572, - "mËĪÉij": 573, - "wËĪe": 574, - "ĠËĮaɪn": 575, - "Ġðɪs": 576, - "É¡ÉĻ": 577, - "ĠnËĪaËIJ": 578, - "ĠbËĪaËIJ": 579, - "Ġaθ": 580, - "ĠmËĮa": 581, - "ËĪÊĮha": 582, - "ĠdËĮa": 583, - "ËĪÊı": 584, - "ĠɲËĮy": 585, - "ĠpËĪa": 586, - "ËĪaðo": 587, - "di": 588, - "bÉľ": 589, - "ɳ": 590, - "ĠwiËIJ": 591, - "ĠnËĪɪ": 592, - "ĠÉ¡ËĪÉĶÉľ": 593, - "tËIJo": 594, - "ËĮÉĻm": 595, - "ËĪaËIJr": 596, - "ĠmÉĽ": 597, - "ËĪeËIJÉ¡aËIJ": 598, - "ĠsËĮi": 599, - "ĠlËĮaËIJ": 600, - "nËĮaËIJ": 601, - "Ġsp": 602, - "tÊģ": 603, - "ĠÊİ": 604, - "ËĮÉijËIJ": 605, - "Ġkl": 606, - "kʰ": 607, - "il": 608, - "ĠÊĥt": 609, - "ĠËĮÊĬn": 610, - "al": 611, - "ĠsËĪÉĽ": 612, - "ĠmËĪaËIJ": 613, - "ĠÅĵ": 614, - "ĠÉ¡ËĪÊĮ": 615, - "ĠpËĮÉĽr": 616, - "ɾËĪa": 617, - "ËIJÊĪ": 618, - "ËĪaβa": 619, - "ĠwËĪÉĴ": 620, - "ĠxËĪuei": 621, - "ĠkhËĪo": 622, - "Ġlas": 623, - "ĠÉĹËĪo": 624, - "ĠfÉĽÉ¾": 625, - "ĠjËĪiÉĽ": 626, - "ĠtËĪe": 627, - "ĠkËĮÉĶ": 628, - "ĠdeËIJn": 629, - "Ġmo": 630, - "ĠpËĪi": 631, - "ĠtËĪÉij": 632, - "ËĪÉĽst": 633, - "wËĪÉij": 634, - "ËĪaɪt": 635, - "ÉĻÊĬ": 636, - "ĠËĪi": 637, - "ɪj": 638, - "aɪ": 639, - "ËĪaËIJÉľ": 640, - "ĠËĪɪs": 641, - "ĠpÉĶɾ": 642, - "Ã¦Éľn": 643, - "ka": 644, - "ÅĭÉ¡": 645, - "bÉĻn": 646, - "ÊĬf": 647, - "Ġpɹ": 648, - "ĠlËĮe": 649, - "ËĪiËIJd": 650, - "ËĪaËIJre": 651, - "ĠmËĪÊĮ": 652, - "ÉĻr": 653, - "ĠdÉij": 654, - "ËĪaËIJto": 655, - "ĠpËĪeËIJ": 656, - "ĠdËĪoËIJ": 657, - "ĠsËĮÊĬ": 658, - "ĠhËĪi": 659, - "ĠsËĪa": 660, - "ËĪeËIJn": 661, - "dÉĻ": 662, - "Ġpj": 663, - "ËĪÅĵÊģ": 664, - "lɪç": 665, - "ÉĴn": 666, - "ĠËĪÉĻr": 667, - "tËĪe": 668, - "Ġil": 669, - "ËĪaËIJl": 670, - "ĠsËĮÉĻÊĬ": 671, - "sÊĪ": 672, - "ĠdËĪuËIJ": 673, - "hËĪÉij": 674, - "ĠxËĪou": 675, - "ĠlËĪaiÉľ": 676, - "wËĪo": 677, - "ËĪÉĽnte": 678, - "Ġsy": 679, - "Ġzɪç": 680, - "ĠÉ¡ËĪu": 681, - "ĠÉķËĪy": 682, - "ËĪÉĶËIJl": 
683, - "ÉĶl": 684, - "ĠtËĪo": 685, - "ĠÊĭoËIJ": 686, - "ĠiËIJ": 687, - "wËĪaða": 688, - "ËĪando": 689, - "Ġaθɼnt": 690, - "ĠaθɼntwËĪaða": 691, - "ĠtËĪiÉĽ": 692, - "ËĪeiÉľ": 693, - "ĠpËĮa": 694, - "ĠnËĪaɪ": 695, - "wa": 696, - "Ġfr": 697, - "ĠÊIJËĪÉĻÉľn": 698, - "ËĪua": 699, - "mi": 700, - "ĠmËĪÉĽ": 701, - "ËĪeËIJkʰ": 702, - "cʰ": 703, - "ĠwËĪÉij": 704, - "sta": 705, - "Ġtu": 706, - "Ġsk": 707, - "ËĪÉĶl": 708, - "ËĪeËIJÊĪ": 709, - "ĠlËĪaËIJɪ": 710, - "ĠlËĪaËIJ": 711, - "ËĪÉĽËIJs": 712, - "ËĪÉĽÉ¾a": 713, - "ËĪÉĻÉľt": 714, - "Ġyn": 715, - "dÉĻn": 716, - "Ġdi": 717, - "ËĪiËIJs": 718, - "Ġðel": 719, - "ËĪÊĮr": 720, - "ĠhËĪaËIJ": 721, - "ĠbÉĻ": 722, - "ĠjËĪuËIJ": 723, - "lle": 724, - "sto": 725, - "ËĪɪt": 726, - "ËĪoËIJɾ": 727, - "bʰ": 728, - "mÉĻn": 729, - "ËĮuÉĻ": 730, - "ËĮÉĻɾ": 731, - "ËĪÊĮn": 732, - "ĠlËĪaɪk": 733, - "ĠbËĪa": 734, - "ɪð": 735, - "Ġlo": 736, - "zi": 737, - "ËĪÊĮst": 738, - "mËĪi": 739, - "ÉĶÊģ": 740, - "ĠnËĪɪçt": 741, - "Ġtɾ": 742, - "ĠdËĪeËIJkʰ": 743, - "ĠsËĮe": 744, - "ĠnËĪÉĻÊĬ": 745, - "Ġu": 746, - "Ġsi": 747, - "Ġɪç": 748, - "Ġpr": 749, - "ĠtÉķËĪy": 750, - "ĠmËĪu": 751, - "za": 752, - "ĠtÊģ": 753, - "Ġwɪð": 754, - "tËĪÉĽ": 755, - "ĠpËĪÊĮɾ": 756, - "ĠkËĪÉĶ": 757, - "ËĪoËIJr": 758, - "ĠhËĮa": 759, - "ĠkËĪonÉ¡": 760, - "ĠpuÊģ": 761, - "Ġdy": 762, - "ËĪɪn": 763, - "nte": 764, - "ĠkËĮa": 765, - "ËĪÉĻɪ": 766, - "Ġmi": 767, - "ĠÉ¡ËĮuÉĻ": 768, - "Ġʲ": 769, - "ĠfËĪÉij": 770, - "ĠvÉijËIJ": 771, - "ĠËĮaÊĬ": 772, - "ËĮuËIJ": 773, - "ĠËĪun": 774, - "ĠjËĪÊĮha": 775, - "juËIJ": 776, - "Ġmɪt": 777, - "ĠlËĪÉĽ": 778, - "ËĪeËIJÊĥ": 779, - "ĠfÉĶËIJ": 780, - "mÉĻ": 781, - "ɾt": 782, - "ĠkËĮon": 783, - "ĠlËĪÉĶ": 784, - "ĠxËĪÉiju": 785, - "pl": 786, - "ĠdËĪi": 787, - "ĠlËĪoËIJ": 788, - "sÉĻ": 789, - "ËĪaËIJva": 790, - "ĠlËĪu": 791, - "ĠÉ¡ËĮÉĻÊĬ": 792, - "Ġhav": 793, - "ĠËĮaËIJpkËĮoËIJ": 794, - "ɾËĪi": 795, - "ĠfËĪÉĻ": 796, - "ĠhËĮÉĻm": 797, - "ËĪonÉ¡Éľ": 798, - "jo": 799, - "ĠsÉĶ": 800, - "ËĪaËIJd": 801, - "wËĪiÉĻ": 802, - "ËĪand": 803, - "ËĮaɪn": 804, - "tɾ": 805, - 
"ĠËĮɪ": 806, - "ĠËĪuna": 807, - "ĠxwËĪÉij": 808, - "ĠjÉĶËIJ": 809, - "ÊģËĪi": 810, - "ĠkËĪuoÉľ": 811, - "Ġaβ": 812, - "ĠÉ¡ËĪaËIJ": 813, - "ano": 814, - "tÉĻl": 815, - "ĠrËĮe": 816, - "ËĮÊĮt": 817, - "ĠjËĪiÉij": 818, - "ĠɾËĮÉĻhaËIJ": 819, - "ĠmËĪe": 820, - "ĠËĪyÃ¦Éľn": 821, - "ĠfËĪu": 822, - "Ġbl": 823, - "nËĪi": 824, - "sÉĻn": 825, - "Ġaɪn": 826, - "ËĪiÊĬ": 827, - "Ġðeɪ": 828, - "Ġɪts": 829, - "Ġ(": 830, - "ËĪyËIJ": 831, - "ÉĻd": 832, - "ĠËĮo": 833, - "ĠÉĽs": 834, - "ĠviËIJ": 835, - "ËIJÉ¡eËIJ": 836, - "kËĪe": 837, - "ĠËĪal": 838, - "ÉĽl": 839, - "ĠÊĮ": 840, - "ËIJo": 841, - "ĠkËĪo": 842, - "ĠÊĪËĪuËIJ": 843, - "ĠsËĪɪ": 844, - "ËĪeËIJɾ": 845, - "Éľm": 846, - "ËĮÉĻn": 847, - "ËĪaËIJi": 848, - "ËĪoËIJl": 849, - "ɪËĮeËIJ": 850, - "ĠʲËĪy": 851, - "ĠkËĪÉĶËIJ": 852, - "sËĪi": 853, - "ĠlËĪe": 854, - "ËĮÉĴt": 855, - "ËĪiËIJp": 856, - "aÊģ": 857, - "ĠθËĪɪÅĭ": 858, - "ËĪÉĻËIJɪ": 859, - "ËĪÊĮl": 860, - "ĠhËĪoËIJtaËIJ": 861, - "ËĪoɪ": 862, - "nto": 863, - "zh": 864, - "ĠdeËIJm": 865, - "ĠkÉĶm": 866, - "ʰËĪiËIJk": 867, - "ĠdÊĴËĪÊĮst": 868, - "pɾ": 869, - "Ġly": 870, - "hËĪu": 871, - "ËĪÉĶø": 872, - "ËĪaËIJs": 873, - "ĠËĪan": 874, - "ĠËĪÉĴ": 875, - "Ġkan": 876, - "ĠtsËĪuo": 877, - "ËĪeËIJva": 878, - "Ġɡɾ": 879, - "Ġpo": 880, - "ĠtÊĥËĪÉĶ": 881, - "Êİa": 882, - "ĠmËĮi": 883, - "Êĥt": 884, - "tËĪi": 885, - "ĠhËĪÊĮ": 886, - "tÊĥe": 887, - "ĠfÉĶn": 888, - "ve": 889, - "ĠnËĮe": 890, - "ËĪÉĶÊģ": 891, - "iz": 892, - "ĠsËĪuo": 893, - "ËĪÉĽËIJr": 894, - "wËĪaÊģ": 895, - "ËĪaða": 896, - "Åĭk": 897, - "po": 898, - "ĠkËĪi": 899, - "ËĪad": 900, - "ĠvËĪi": 901, - "tÉķ": 902, - "ĠkËĪÉĻ": 903, - "ĠwËĪu": 904, - "ÉĴz": 905, - "ĠvÉijËIJɾ": 906, - "ÊģËĪÉĽ": 907, - "ĠkËĪaËIJ": 908, - "ke": 909, - "nÉĻ": 910, - "ËĪÊĮb": 911, - "ËĪuËIJɾ": 912, - "ËĮÉĻËIJ": 913, - "ĠÊĪʰËĪiËIJk": 914, - "ĠkËĪu": 915, - "ĠbËĮÊĮt": 916, - "Ġat": 917, - "Ġfɹ": 918, - "ËĪax": 919, - "ĠzoËIJ": 920, - "ĠtËĪaËIJ": 921, - "ĠðËĮe": 922, - "neËIJ": 923, - "ĠÉijËIJ": 924, - "ĠaÊĬf": 925, - "am": 926, - "ÊĬÅĭ": 927, - "ĠÉĶËIJ": 928, 
- "ĠÉķËĪiÉľÅĭ": 929, - "ĠËĪÉĶËIJl": 930, - "ɪm": 931, - "jËĪo": 932, - "ËĪiËIJÉŁ": 933, - "ĠkwËĮÉĽ": 934, - "ĠmËĪas": 935, - "ÉĻh": 936, - "ĠËĪaÊĬ": 937, - "ËĪÉĶɪ": 938, - "É¡ÉĻɾ": 939, - "rÉĻn": 940, - "ËĪɪk": 941, - "sse": 942, - "ĠpËĪÉij": 943, - "ĠÉĹËĮe": 944, - "ĠÉĹËĪi": 945, - "Ġaz": 946, - "ĠÉ¡ËĪÊĮjaËIJ": 947, - "ze": 948, - "ĠÉĹËĮaËIJ": 949, - "ĠfËĪi": 950, - "ĠËĮÉĴn": 951, - "ĠxËĪo": 952, - "ĠËĮÊĬna": 953, - "ĠtʰaËIJ": 954, - "ĠsÉij": 955, - "ËĪeɪÊĥÉĻn": 956, - "ĠtÉķËĪiÉľ": 957, - "ĠÉŁaËIJ": 958, - "pËIJ": 959, - "Ġply": 960, - "θËĪi": 961, - "ËIJÉĸ": 962, - "ĠtËĪuei": 963, - "ĠlËĪÉĻ": 964, - "ĠdÉijËIJ": 965, - "ft": 966, - "ËĪam": 967, - "ĠsËĪÊĮkt": 968, - "ĠtËĪou": 969, - "ĠpËĪiÉĽ": 970, - "ĠËĪai": 971, - "ĠwËĪÉĴn": 972, - "ĠzËĮaɪn": 973, - "Ġest": 974, - "ĠmÉĶ": 975, - "ĠtÉķjËĪÉiju": 976, - "Éľp": 977, - "ËĪÊĮz": 978, - "bi": 979, - "ËĪÉĽËIJseËIJ": 980, - "ĠlËĪy": 981, - "ĠmËĮe": 982, - "ĠdËĮÉĽl": 983, - "ËĪiËIJl": 984, - "ĠkËĮomo": 985, - "ĠhËĪaÉľn": 986, - "ËĪoËIJne": 987, - "ĠkËĪÊĮɾt": 988, - "ĠsyÊģ": 989, - "ËĮÉĶɾ": 990, - "Ġɪf": 991, - "uv": 992, - "zÉĻn": 993, - "ol": 994, - "Ïĩ": 995, - "im": 996, - "ĠmËĪiÉĽ": 997, - "Ġðɪ": 998, - "ĠvËĪÉĽ": 999, - "ÊĬd": 1000, - "Ġtr": 1001, - "ËĪeËIJs": 1002, - "ðe": 1003, - "de": 1004, - "ʰÏĩ": 1005, - "ÉŁÊ°": 1006, - "ËĮÉĻËIJÉªÉľ": 1007, - "bËIJ": 1008, - "ËĪÊĬk": 1009, - "ĠnËĪÉĶÉªÉľ": 1010, - "ĠËĮiËIJ": 1011, - "ËĪÉijËIJt": 1012, - "ËĪiËIJɾ": 1013, - "Ġtɹ": 1014, - "ɾÉĶ": 1015, - "ĠwÉĴz": 1016, - "Ġvu": 1017, - "bÉĻl": 1018, - "bÉĻ": 1019, - "ɹi": 1020, - "nts": 1021, - "ĠsËĪaËIJ": 1022, - "dʰ": 1023, - "ĠtÊĬ": 1024, - "ĠÊİËĮi": 1025, - "βa": 1026, - "hËĪÉĻÉľÅĭ": 1027, - "ĠsËĪiËIJ": 1028, - "ĠpËĮaɾa": 1029, - "ËĪÉĽÉ¾ÉĶ": 1030, - "ËĪɪs": 1031, - "É£o": 1032, - "ĠËĮal": 1033, - "or": 1034, - "ĠbËĪÊĮh": 1035, - "ĠkËĪoËIJ": 1036, - "ĠtËĪÉĽ": 1037, - "ĠpËĪo": 1038, - "ĠÊĴÉĻ": 1039, - "pÊģ": 1040, - "ĠËĪaɪ": 1041, - "hËĪÉijÉľÅĭ": 1042, - "ÉĻli": 1043, - "ËĪeɪt": 1044, - "ĠjËĪiouÉľ": 1045, - "ĠdËĪÉĻ": 1046, - 
"ĠmËĪÉĶËIJ": 1047, - "lËĪi": 1048, - "ËĮyÉĻ": 1049, - "ĠlËĪoËIJÉ¡": 1050, - "ĠnËĪÊĮ": 1051, - "ĠhËĪÊĬ": 1052, - "ĠnËĪÉĻÉľÅĭ": 1053, - "ĠÊģÉĻ": 1054, - "zËĪi": 1055, - "ĠtËĪuËIJ": 1056, - "ĠkËĮome": 1057, - "ĠlËĪeËIJ": 1058, - "ËĪaËIJtaËIJ": 1059, - "Ġan": 1060, - "ĠËĪyu": 1061, - "ĠËĮÊĮÉ¡ÉĻɾ": 1062, - "ĠËĪɪn": 1063, - "ĠhËĪoÉĻ": 1064, - "vÉĻ": 1065, - "ËĪøËIJ": 1066, - "θja": 1067, - "ËĪuÉĻÉľn": 1068, - "ĠkÉĻɾ": 1069, - "ËĪat": 1070, - "jËĪø": 1071, - "ËĪÉĽtÊģ": 1072, - "ĠpËĪÉiju": 1073, - "stÉĻ": 1074, - "ĠwÉĴt": 1075, - "ËĪeËIJl": 1076, - "ÊĪi": 1077, - "ĠxËĪaiÉľ": 1078, - "ËĪyÊģ": 1079, - "ĠhËĪoËIJÉ¡aËIJ": 1080, - "ĠtsËĪi": 1081, - "ĠËĪÊĮp": 1082, - "ĠnËĮÉĴt": 1083, - "ĠlËĪɪeËIJ": 1084, - "ĠhËĪa": 1085, - "Ġfl": 1086, - "ĠnËĪeËIJ": 1087, - "ËĮaËIJɪ": 1088, - "ĠtËĪuo": 1089, - "tÊĥËIJ": 1090, - "sËĪe": 1091, - "bʰi": 1092, - "ĠbËĪÊĮhÊĬt": 1093, - "ËĪÉĽnd": 1094, - "ĠsËĪÉĶ": 1095, - "ÉĻns": 1096, - "ËĮÉĻl": 1097, - "ÉĽÉľ": 1098, - "ĠÉ¡l": 1099, - "ËĪɪɾ": 1100, - "ËĪaËIJta": 1101, - "ÉľËIJ": 1102, - "ËĪÉĽnto": 1103, - "skËĮoËIJ": 1104, - "ËĪÉĽk": 1105, - "tsi": 1106, - "ĠtËĪonÉ¡": 1107, - "ĠbiËIJ": 1108, - "ĠhËĪaËIJɪ": 1109, - "ĠbËĪi": 1110, - "jj": 1111, - "Êİi": 1112, - "Ġkʰ": 1113, - "ĠsËĪo": 1114, - "llo": 1115, - "Ġbaɪ": 1116, - "ĠÉĽnt": 1117, - "ĠËĪiËIJ": 1118, - "ĠÉ¡ËĪo": 1119, - "ɾeËIJ": 1120, - "ĠkÊĭ": 1121, - "ĠmËĪeiÉľ": 1122, - "ÊĬËĪÉĶËIJ": 1123, - "ĠtËĪaɪ": 1124, - "Ġsus": 1125, - "Ġri": 1126, - "ĠvËĮÉĽ": 1127, - "ËĪiËIJno": 1128, - "vano": 1129, - "ĠdËĮiËIJ": 1130, - "ĠÊIJËĪaÉľn": 1131, - "ÊĤ": 1132, - "ĠÉIJb": 1133, - "ËĪaËIJh": 1134, - "ɪÊĥ": 1135, - "ĠdËĮella": 1136, - "tËIJi": 1137, - "ĠËĪÊĬn": 1138, - "ĠhiËIJ": 1139, - "ĠbËĪaËIJt": 1140, - "ĠthËĪi": 1141, - "Ġam": 1142, - "ĠËĪoËIJ": 1143, - "Ġhu": 1144, - "ĠkËĪÊĮh": 1145, - "ĠzËĪÉijËIJ": 1146, - "ĠÉ¡ËĮÉĶ": 1147, - "ĠËĪÉĻÊĬ": 1148, - "yËĪi": 1149, - "ĠlËĪÊĮ": 1150, - "ĠdËĪeËIJ": 1151, - "ĠsËĪÉĶËIJ": 1152, - "skËĮeËIJ": 1153, - "ɾo": 1154, - "ÊģËĪÉij": 1155, - "tËĪa": 1156, - "ĠkËĪÊĬ": 1157, - 
"ËĪante": 1158, - "ĠdÉĶ": 1159, - "ĠsËĪeɪ": 1160, - "ĠsÉĽt": 1161, - "ɹɪ": 1162, - "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, - "zo": 1164, - "ĠjËĪaËIJ": 1165, - "ĠÉĴvðÉĻ": 1166, - "ĠÊĿ": 1167, - "ĠÉĽl": 1168, - "ĠsËĪoËIJ": 1169, - "ĠthËĪiÉľ": 1170, - "ĠËĪÉĽl": 1171, - "ĠlyËĮi": 1172, - "ndÊĴ": 1173, - "ĠÉķjËĪÉiju": 1174, - "θa": 1175, - "ĠɾËĮÉĻheËIJ": 1176, - "Ġmaɪ": 1177, - "jÉĻ": 1178, - "ĠËĪÊĮb": 1179, - "asjËĪÉĶ": 1180, - "dÊģ": 1181, - "ĠkhËĪa": 1182, - "ĠËĪes": 1183, - "vi": 1184, - "fi": 1185, - "ËĮÉĻb": 1186, - "Ġre": 1187, - "ĠavËĮÉĽ": 1188, - "ĠtËĮi": 1189, - "Ġkɾ": 1190, - "Ġbɪk": 1191, - "ste": 1192, - "ËĪeËIJÊĥc": 1193, - "pt": 1194, - "zÉĻ": 1195, - "ĠwËĪaËIJ": 1196, - "kl": 1197, - "ĠsËĪÊĮm": 1198, - "ɪÊĪ": 1199, - "dz": 1200, - "vo": 1201, - "ËĮaÊĬt": 1202, - "nde": 1203, - "ĠdÉĽs": 1204, - "ĠÉŁËĪaËIJ": 1205, - "ĠrËĮi": 1206, - "sËĮeËIJ": 1207, - "É¡i": 1208, - "Ġals": 1209, - "ËĪiðo": 1210, - "ĠnËĪiÉľn": 1211, - "ÊĬl": 1212, - "tsËIJ": 1213, - "ËĪanto": 1214, - "ĠÉĹËĪÉĻÊĬ": 1215, - "kËIJi": 1216, - "ĠsËĪÊĮb": 1217, - "ĠnËĪa": 1218, - "ĠlËĮo": 1219, - "ĠphËĪi": 1220, - "mËĮe": 1221, - "Ġfa": 1222, - "kÉĻ": 1223, - "ĠzËĪu": 1224, - "ns": 1225, - "ĠÊģe": 1226, - "ĠbËĪo": 1227, - "ËĪaËIJti": 1228, - "Ġman": 1229, - "ĠlËĪiÉij": 1230, - "ĠÉĹËĮyÉĻ": 1231, - "ĠfËĪÉĶËIJ": 1232, - "ĠkÊĭËĪeËIJÊĥc": 1233, - "ĠxËĪÉij": 1234, - "ĠtÉķËĪu": 1235, - "jÉĻɾ": 1236, - "Ġɪst": 1237, - "wËĪi": 1238, - "ĠËĮaɪnÉĻ": 1239, - "ɪɡ": 1240, - "ĠsÊĪ": 1241, - "ËĪiÉĻl": 1242, - "ĠnËĪiÉĽÉľn": 1243, - "ĠËĮÉĽËIJ": 1244, - "ËĪaɪnd": 1245, - "ĠzËĪi": 1246, - "vÉĻn": 1247, - "mz": 1248, - "ðos": 1249, - "dÊĴËIJ": 1250, - "jËĪa": 1251, - "ɾËĪÉĶ": 1252, - "lËĪe": 1253, - "ʲ": 1254, - "ĠvËĪÉĶ": 1255, - "ĠlËĪiÉĽ": 1256, - "θe": 1257, - "mËĪente": 1258, - "ĠɪnðÉĻ": 1259, - "Ġaɪm": 1260, - "nÉĻn": 1261, - "ĠhÉĻm": 1262, - "ɾaËIJ": 1263, - "ĠsËĪuoÉľ": 1264, - "ĠɲËĪi": 1265, - "ĠɹËĪiÉĻl": 1266, - "lËĪa": 1267, - "ĠbËĪÉĶ": 1268, - "ĠkËĪai": 1269, - "ÊģËĪa": 1270, - "ĠwËĪÉľËIJ": 1271, - "ĠaËIJ": 1272, - "Ġpas": 
1273, - "ËĪÊĮs": 1274, - "wËĪÉĽÉ¾": 1275, - "ĠÉĹËĪe": 1276, - "ĠhËĮatÉĻ": 1277, - "aɪn": 1278, - "ĠËĪÉĶpʰ": 1279, - "ÊģËĪe": 1280, - "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, - "ĠËĪÊĬs": 1282, - "ĠtÉķhËĪiÉľ": 1283, - "ntÊĥ": 1284, - "ĠxËĪuo": 1285, - "ËĪuÊģ": 1286, - "Ġɪm": 1287, - "ɳÉĸ": 1288, - "ËĪyÉĻÉľkh": 1289, - "ĠËĪyÉĽ": 1290, - "ĠmËĮaËIJ": 1291, - "ÅĵÊģ": 1292, - "ĠËĪalt": 1293, - "ĠkÉĻm": 1294, - "Êİo": 1295, - "ĠÉIJn": 1296, - "Ġfy": 1297, - "ĠËĮÉĽra": 1298, - "ĠÉ¡ËĪÊĬ": 1299, - "ĠpËĪÊĮ": 1300, - "ls": 1301, - "ĠlËĪiËIJ": 1302, - "ĠÊĤËĪy": 1303, - "ĠbɪkËĪÊĮz": 1304, - "ĠÉ¡ÉĽt": 1305, - "Ġbɾ": 1306, - "tʰ": 1307, - "tÉĻlËĮÉĻb": 1308, - "xo": 1309, - "skËĮaËIJ": 1310, - "ɲʲ": 1311, - "ËĪeËIJkÊĪ": 1312, - "rÉĻ": 1313, - "tÊĥo": 1314, - "ĠpÊģÉĶ": 1315, - "ĠɹËĪaɪt": 1316, - "ĠpËĪei": 1317, - "ËĮɪç": 1318, - "jËĪÉĽÉ¾": 1319, - "tËIJa": 1320, - "ĠÉIJbËĮaÊĬt": 1321, - "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, - "ĠvËĪe": 1323, - "ÊĬÉľ": 1324, - "ĠakËĪe": 1325, - "ĠpËĪai": 1326, - "vËĪÉĽ": 1327, - "Ġθɹ": 1328, - "ɪf": 1329, - "ĠavËĪÉĽ": 1330, - "ĠkËĪe": 1331, - "dËĪi": 1332, - "ËĪeËIJÉĸ": 1333, - "ĠbÉĻt": 1334, - "ÊĪʰ": 1335, - "teËIJ": 1336, - "θjËĪÉĶn": 1337, - "dÉľ": 1338, - "ĠjËĪiÉľ": 1339, - "Ġve": 1340, - "É£ËĪu": 1341, - "ËĪÊĮhÉĻl": 1342, - "ĠpÉĶ": 1343, - "ĠÉ¡r": 1344, - "Ġða": 1345, - "ĠvËĪiËIJ": 1346, - "ĠËĮÉijËIJ": 1347, - "ËĪÉĻÊĬnt": 1348, - "ĠbËĪaËIJɾ": 1349, - "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, - "ld": 1351, - "ĠtÉķËĮÉĶ": 1352, - "pa": 1353, - "ðËĪad": 1354, - "ËĪiɾ": 1355, - "ĠxËĪu": 1356, - "ĠlËĪiÉľÅĭ": 1357, - "ËĪeɪs": 1358, - "ĠÉĹËĮeÉľn": 1359, - "ĠthËĪiÉĽ": 1360, - "tËIJe": 1361, - "ĠavËĮÉĽk": 1362, - "ĠËĮÉĶ": 1363, - "ĠkËĪÉiju": 1364, - "ɪv": 1365, - "iËIJz": 1366, - "ËĪos": 1367, - "Ġɡɹ": 1368, - "and": 1369, - "ĠlËĪiou": 1370, - "ĠËĪoÉľ": 1371, - "É¡l": 1372, - "ĠpËĪÉĶËIJ": 1373, - "ĠmËĮeËIJ": 1374, - "ĠkËĪÉĴ": 1375, - "nos": 1376, - "çÉĻn": 1377, - "fÉĻn": 1378, - "ĠsËĪÊĮktËĮeËIJ": 1379, - "ĠËĪaɪn": 1380, - "ËĪoËIJre": 1381, - "jËĪÉĽn": 1382, - "ĠðËĪÉĽn": 1383, - "ĠtÉķhËĪiÉĽÉľn": 
1384, - "ĠhËĪaɪ": 1385, - "ɾËĪÉĽ": 1386, - "ĠsËĪu": 1387, - "ĠkËĪɪjaËIJ": 1388, - "ĠpjËĮÊĬ": 1389, - "ĠhÉĻmËĮaËIJ": 1390, - "ĠËĮÊĮp": 1391, - "ĠpËĪÊĮhÉĻl": 1392, - "ĠxËĪÉĻ": 1393, - "dËĪe": 1394, - "ĠmÉij": 1395, - "ĠÊĬm": 1396, - "ndÉĻ": 1397, - "ĠdËĪÉĻÊĬnt": 1398, - "ËĪeËIJÊĥÉĻn": 1399, - "Ġðats": 1400, - "is": 1401, - "ĠcËĪaËIJh": 1402, - "pe": 1403, - "ĠsËĮo": 1404, - "ĠðËĪe": 1405, - "ĠsËĪaËIJt": 1406, - "ËĪaÊģ": 1407, - "ĠsËĪe": 1408, - "ÉĻk": 1409, - "ɪÊĭ": 1410, - "ĠkËĪoËIJi": 1411, - "kÉĶ": 1412, - "ĠvËĪaËIJÊĬ": 1413, - "ĠfËĪei": 1414, - "ĠlËĪeËIJk": 1415, - "ĠhËĪiÉĻ": 1416, - "ĠaÊĬ": 1417, - "ËĪÉĽndo": 1418, - "ËĪes": 1419, - "ĠzËĪÉĶ": 1420, - "ĠËĪÉĽÉ¾a": 1421, - "nËĪiÉľn": 1422, - "ĠkËĪÊĮm": 1423, - "ĠlËĪÉĴ": 1424, - "ɪst": 1425, - "ĠpÉij": 1426, - "ĠfËĪÉĶ": 1427, - "ĠthËĪonÉ¡": 1428, - "nke": 1429, - "ËĮɪk": 1430, - "ĠɲËĪÉĻ": 1431, - "ËĮÊĮm": 1432, - "ËĪiËIJt": 1433, - "ĠwËĪÉĴnt": 1434, - "ËĪaβan": 1435, - "ĠbËĪÊĮr": 1436, - "ÉĽnd": 1437, - "ĠËĮÉijËIJbÉľ": 1438, - "ĠvËĪaɪ": 1439, - "ĠtÊĥËĮi": 1440, - "ĠθËĪɪÅĭk": 1441, - "sti": 1442, - "Ġkɹ": 1443, - "ĠËĪaÊĬt": 1444, - "stÉĻn": 1445, - "ĠÊĭËĪÊĮn": 1446, - "ĠÉ¡ËĮaËIJ": 1447, - "ËĪaËIJÉľÉ²": 1448, - "Êģi": 1449, - "ĠnËĪÉĶx": 1450, - "ĠɹËĪiÉĻlɪ": 1451, - "ĠvËĮi": 1452, - "ĠðeÉĻ": 1453, - "ËĮɪtÊĥ": 1454, - "ĠvËĪyÉĻ": 1455, - "ĠËĮaËIJpkËĮaËIJ": 1456, - "ĠfËĮaËIJɪ": 1457, - "ĠpËĪÉĶ": 1458, - "ĠnËĪÊĮmb": 1459, - "θes": 1460, - "jËĪÉĽÊģ": 1461, - "ĠkËĪÊĬcʰ": 1462, - "mËĪÉĽ": 1463, - "ĠvËĪu": 1464, - "ĠlÅĵÊģ": 1465, - "ĠiËIJm": 1466, - "ÊĪÉĻɾ": 1467, - "tÊĥi": 1468, - "ËIJs": 1469, - "ĠtËĪy": 1470, - "ĠmËĪiÉľÅĭ": 1471, - "ɾËĪe": 1472, - "mËĮa": 1473, - "ĠmËĮiËIJ": 1474, - "ĠÉĽks": 1475, - "ɪp": 1476, - "ĠkËĪÊĮɾnËĮaËIJ": 1477, - "ĠËĮaÊĬx": 1478, - "rËĪiËIJ": 1479, - "ĠcËĪÊĮl": 1480, - "mos": 1481, - "ĠkËĪÊĮɾtËĮeËIJ": 1482, - "iËIJɾ": 1483, - "kÉĻn": 1484, - "ĠdËĪu": 1485, - "naËIJ": 1486, - "ĠpwËĪe": 1487, - "ËĮÉĶɪ": 1488, - "ĠtÉķhËĪiÉĽ": 1489, - "ĠβËĪi": 1490, - "ËĪiÉĽÉľt": 1491, - "Ġte": 1492, - "ËĪaðos": 1493, 
- "mËĪa": 1494, - "ĠvËĪo": 1495, - "ĠmËĪɪ": 1496, - "ĠbËĮi": 1497, - "ad": 1498, - "do": 1499, - "ĠnËĪaÊĬ": 1500, - "ĠʲËĪyÉľ": 1501, - "wËĪÉĽ": 1502, - "ËĪis": 1503, - "el": 1504, - "Ġpar": 1505, - "ĠtËĪai": 1506, - "ĠdËĪɪjaËIJ": 1507, - "hËĪi": 1508, - "ĠɾËĪÊĮ": 1509, - "ĠdËĪe": 1510, - "ËĪaɪd": 1511, - "Ġper": 1512, - "ĠsËĮÉĶ": 1513, - "we": 1514, - "ÊĬm": 1515, - "Ġin": 1516, - "ĠjËĪuËIJz": 1517, - "ËĪiËIJpÉĻl": 1518, - "ĠÊĭËĪaËIJl": 1519, - "ĠetËĪÉĽ": 1520, - "ËĮÉĽm": 1521, - "ĠnËĪu": 1522, - "ËĪÉĽkt": 1523, - "ĠiËIJɾ": 1524, - "Ġbɹ": 1525, - "ĠtshËĪi": 1526, - "ĠÉĹËĪÉĶÉľ": 1527, - "ĠkwËĮa": 1528, - "ĠfËĪuÉľ": 1529, - "wËĮa": 1530, - "ĠdËĪiËIJ": 1531, - "ĠÉ¡ËĪyÉĻ": 1532, - "ËĮÉĽËIJ": 1533, - "rËĪa": 1534, - "Ġne": 1535, - "ĠzËĪyÉĻ": 1536, - "ĠbËĪaɪ": 1537, - "ĠÉŁËĪÊĮb": 1538, - "ËĪuËIJto": 1539, - "ÊĬnt": 1540, - "Ġcʰ": 1541, - "ËĪÉĽnti": 1542, - "ËĪoÉĻ": 1543, - "ĠsËĮÊĮm": 1544, - "ĠlÉij": 1545, - "ËĮeva": 1546, - "É¾ÉĽ": 1547, - "ntÉľ": 1548, - "ĠmËĪÉĽn": 1549, - "ËĪÉijËIJk": 1550, - "Ġkil": 1551, - "ËĪones": 1552, - "ff": 1553, - "ĠmËĪÉĽËIJ": 1554, - "ĠvËĪÉĻɪ": 1555, - "ĠËĪÉĶËIJ": 1556, - "ĠËĮɪnt": 1557, - "ÊĬn": 1558, - "Ġwɪl": 1559, - "Ġsin": 1560, - "ĠËĮalla": 1561, - "ĠaβËĪia": 1562, - "pi": 1563, - "ËĪoÉľ": 1564, - "ɪjËĮaËIJ": 1565, - "ku": 1566, - "ĠvËĪɪ": 1567, - "Ġtut": 1568, - "ĠtËĪeÉľ": 1569, - "ĠhËĪÉĶ": 1570, - "βɾe": 1571, - "sÉĻɾ": 1572, - "ĠkhËĪai": 1573, - "ĠmËĪÉĶ": 1574, - "Ġta": 1575, - "ĠɲËĪaËIJ": 1576, - "Ġnu": 1577, - "ËĪuËIJn": 1578, - "ĠÉĻËIJÉľ": 1579, - "ĠËĪaÊĬf": 1580, - "ËĪiËIJdÉľ": 1581, - "nti": 1582, - "ĠpËĪiËIJpÉĻl": 1583, - "Ġkj": 1584, - "Ġpe": 1585, - "ĠmËĪÉij": 1586, - "ËĮaɪ": 1587, - "ËĪaËIJle": 1588, - "ĠvËĮÉĻËIJÉªÉľ": 1589, - "mpo": 1590, - "ĠkËĪɪt": 1591, - "ĠnËĮÉĽ": 1592, - "ĠÉŁËĪaËIJtaËIJ": 1593, - "ĠsËĪaËIJtʰ": 1594, - "ĠÉŁËĪi": 1595, - "Ġso": 1596, - "ĠbËĪÉĽ": 1597, - "kËĪi": 1598, - "ɪti": 1599, - "Ġtsi": 1600, - "ĠkÊģ": 1601, - "ËĮÉĴ": 1602, - "É¡ÉĻl": 1603, - "kst": 1604, - "ĠmËĪÉĻËIJ": 1605, - "ËĪÊĮk": 1606, - 
"ĠnËĪaËIJÊĬ": 1607, - "Ġap": 1608, - "ĠlËĪɪkʰ": 1609, - "lli": 1610, - "ĠkwËĪal": 1611, - "ĠËĪÉĻËIJ": 1612, - "ĠtsËĪuei": 1613, - "Ġdo": 1614, - "ĠkËIJjËĪo": 1615, - "ÊĬz": 1616, - "ĠpËĪaËIJ": 1617, - "ĠmËĪuËIJ": 1618, - "ĠÉ¡ÉĻv": 1619, - "rËĪi": 1620, - "Ġtw": 1621, - "ËĮɪn": 1622, - "dËĪÉij": 1623, - "ĠðËĪi": 1624, - "ĠËĪaËIJi": 1625, - "ĠhËĪiÉĽ": 1626, - "ĠðËĮÉĽm": 1627, - "ĠpʰËĪɪɾ": 1628, - "ÉĴm": 1629, - "ĠËĮeËIJ": 1630, - "ĠthËĪaiÉľ": 1631, - "ĠvËĪas": 1632, - "ĠnÉijËIJ": 1633, - "pÉĻn": 1634, - "ĠpËĮÉĻɾ": 1635, - "ĠÉĹËĪaËIJɪ": 1636, - "ËĪouÉľ": 1637, - "ĠÊIJËĪuÉľ": 1638, - "ĠmËĪan": 1639, - "ĠtËĪÉĻÉªÉľ": 1640, - "ĠlËĪaËIJÊĬ": 1641, - "mËĪÉĽnte": 1642, - "ĠfËĪam": 1643, - "sjËĪÉĶ": 1644, - "ĠpËĪÉĻ": 1645, - "ËĪeËIJm": 1646, - "ĠpËĪÊĮr": 1647, - "jËĪi": 1648, - "ĠlÉĽ": 1649, - "Ġten": 1650, - "ËĪoËIJra": 1651, - "ki": 1652, - "ĠÊĤËĪaËIJÊĬ": 1653, - "kɪ": 1654, - "bËIJe": 1655, - "ËĪalt": 1656, - "ðɪ": 1657, - "pËĪi": 1658, - "ĠËĮÉĽnt": 1659, - "ĠmËĪei": 1660, - "ĠhËĪÉĻÊĬ": 1661, - "ĠhËĪÉĽÉ¾": 1662, - "jËĪÉij": 1663, - "ĠhËĪÊĬaËIJ": 1664, - "mÉľ": 1665, - "Ġdʰ": 1666, - "ĠtÊĥËĪe": 1667, - "lËĪÉĽ": 1668, - "ËĪaËIJte": 1669, - "ĠpËĪuËIJ": 1670, - "ĠmËĪÊĬ": 1671, - "ËĪaËIJɪÊĪ": 1672, - "diËIJ": 1673, - "ĠfɹÉĴm": 1674, - "ĠhËĪÉijËIJ": 1675, - "βo": 1676, - "ĠmËĪiÉľn": 1677, - "ĠðiËIJz": 1678, - "ĠkËĪou": 1679, - "ËĪiËIJna": 1680, - "ĠavËĮeva": 1681, - "ĠËĪaËIJɾ": 1682, - "ĠnËĪuËIJɾ": 1683, - "ĠβËĪe": 1684, - "Ġzaɪn": 1685, - "ËĪÉĽd": 1686, - "ÉĹ": 1687, - "ËĪeɪk": 1688, - "sËĮÉĻÊĬ": 1689, - "ËĪeËIJÉŁ": 1690, - "ĠÊĤËĪÉĻËIJ": 1691, - "je": 1692, - "cʰËIJ": 1693, - "ËĪÉĶr": 1694, - "ÉĽËIJ": 1695, - "ĠtÉķhËĪyÃ¦Éľn": 1696, - "ĠËĮaɪnÉĻn": 1697, - "ĠiËIJn": 1698, - "ĠbËĪÊĮc": 1699, - "ËĪiËIJm": 1700, - "ɾas": 1701, - "ËĮÉĻs": 1702, - "ĠvËĪeËIJ": 1703, - "ĠËĪÉĻrÉľ": 1704, - "ĠduËIJ": 1705, - "ntÉĻ": 1706, - "ĠpɹËĪÉĴ": 1707, - "ĠbËĪɪ": 1708, - "ĠwËĪoÉľ": 1709, - "nËĮi": 1710, - "ĠhÉIJ": 1711, - "ĠkËĪÉĽ": 1712, - "Ġet": 1713, - "jËĪÉĽndo": 1714, - "ĠËĪaiÉľ": 1715, - "Ġli": 
1716, - "ĠËĪaÊĬs": 1717, - "kËIJo": 1718, - "ĠÉĹËĪyÉĻ": 1719, - "keËIJ": 1720, - "ĠfËĪiËIJl": 1721, - "ĠbʰËĪaËIJi": 1722, - "ĠÉ¡ÉĻÊĥ": 1723, - "ÊĴËĪe": 1724, - "ĠnjËĪuËIJ": 1725, - "ĠËĪak": 1726, - "ĠÉĹËĪaËIJ": 1727, - "zËĪa": 1728, - "vËĪe": 1729, - "ĠhËĮaÊĬ": 1730, - "ÉIJç": 1731, - "ĠɾËĪÊĮkʰ": 1732, - "pËĪe": 1733, - "ĠtÉĻbi": 1734, - "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, - "ĠfËĪÉĽ": 1736, - "ĠwËĮɪtÊĥ": 1737, - "ĠtÉķËĪyÉĽÉľ": 1738, - "wËĮe": 1739, - "ËĮaɪt": 1740, - "ĠnÉijËIJx": 1741, - "ĠkËĪÉĶËIJn": 1742, - "ÊĬk": 1743, - "ĠbËĪaËIJd": 1744, - "ÅĭÉĻn": 1745, - "Ġni": 1746, - "ĠbËĪe": 1747, - "ĠmËĮÊĬ": 1748, - "ËĪar": 1749, - "ĠmËĮeɪk": 1750, - "ĠsËĪaËIJɾ": 1751, - "βe": 1752, - "ĠtÉķhËĪiÉľÅĭ": 1753, - "itËĪe": 1754, - "kËĮe": 1755, - "ËĪÉĽËIJl": 1756, - "ËĮÉĴn": 1757, - "ËĮÉij": 1758, - "ĠbËĪɪl": 1759, - "ĠwÊĬd": 1760, - "ĠbËĪoËIJl": 1761, - "rd": 1762, - "iÉĻ": 1763, - "Ġda": 1764, - "ĠbËĪaËIJÊĬ": 1765, - "ĠnËĪÊĮmbÉĻɾ": 1766, - "ËĪaËIJÉªÉľ": 1767, - "ĠÉĽm": 1768, - "ĠmiËIJɾ": 1769, - "ËĪeɪm": 1770, - "los": 1771, - "ËĮÉĽt": 1772, - "ĠËĮaÊĬs": 1773, - "ĠmËĪaÉľt": 1774, - "ĠwËĪuÉĻ": 1775, - "ĠwËĪeɪ": 1776, - "Ġseɲ": 1777, - "ĠbjËĪÉĽ": 1778, - "ĠwÉĽn": 1779, - "fl": 1780, - "ĠkhwËĪa": 1781, - "dËĪÉĽ": 1782, - "vɹɪ": 1783, - "ĠËĪaɾ": 1784, - "jËĪÉijuÉľ": 1785, - "ĠËĮaËIJpkËĮeËIJ": 1786, - "bÊģ": 1787, - "ĠtËĪaɪm": 1788, - "ĠËĪÉij": 1789, - "ĠsËĮa": 1790, - "ĠzËĪoɪ": 1791, - "ËĪÉĶɾa": 1792, - "ĠdËĪø": 1793, - "ËĪÉĶɾt": 1794, - "ĠÅĭËĪÉĶ": 1795, - "min": 1796, - "ĠlËĪÊĬk": 1797, - "ËĪÉĶËIJt": 1798, - "ĠËĪÉĶtɾ": 1799, - "ĠfËĪaɪ": 1800, - "ĠÉ¡ÉĴt": 1801, - "ËĪeËIJÉĻn": 1802, - "kËĪÉĶ": 1803, - "ĠvËĪÉĽÉ¹i": 1804, - "mÉĽ": 1805, - "ËĪaɪz": 1806, - "Ġesp": 1807, - "ɲa": 1808, - "ĠlËĪo": 1809, - "ËĪÉĽËIJra": 1810, - "βËĪi": 1811, - "ouÉľ": 1812, - "ËĮÉĻk": 1813, - "tÊĥuËIJ": 1814, - "ĠnËĪyÉĻ": 1815, - "ÊĪɾ": 1816, - "ĠÉ¡ËĪy": 1817, - "ĠtËĪoðo": 1818, - "ËĪɪçt": 1819, - "Ġmɪç": 1820, - "ĠËĪand": 1821, - "ĠkwËĮÉĽl": 1822, - "ĠÊĤËĪaËIJ": 1823, - "ĠnËĪiÉľ": 1824, - "ËĪÉĶp": 1825, - 
"ËĪiËIJz": 1826, - "ĠÊĤËĪaÊĬ": 1827, - "ĠɾËĮÉĻhi": 1828, - "ĠsËĮÊĬo": 1829, - "ĠÉĽÉ¡": 1830, - "ĠdÅĵ": 1831, - "ĠÉ¡ËĮaËIJÉªÉľ": 1832, - "dɪ": 1833, - "lËĮa": 1834, - "stËĪi": 1835, - "ĠdËĮiËIJz": 1836, - "ĠtËĮÊĬ": 1837, - "θi": 1838, - "ĠËĪɪskËĮoËIJ": 1839, - "ndÉĻn": 1840, - "Ġtsv": 1841, - "ĠhËĪÉĻËIJ": 1842, - "ĠÊĥËĪÊĬ": 1843, - "ÉĻtËĮeËIJ": 1844, - "pËĮÉĽ": 1845, - "ËĪaɾÉĶn": 1846, - "ĠpÉĽÊģ": 1847, - "Ġy": 1848, - "mnËĮeËIJ": 1849, - "ËĪÉĽllo": 1850, - "ĠÉ¡ËĪÉĻ": 1851, - "ĠËĮad": 1852, - "ĠÊĥv": 1853, - "ËĪÊıɾ": 1854, - "rËĪe": 1855, - "yËIJ": 1856, - "ĠpËĪaËIJs": 1857, - "ĠËĪÉĽn": 1858, - "ɪdÊĴ": 1859, - "ËĪuai": 1860, - "Ġfi": 1861, - "ĠtËĪyÉĻ": 1862, - "ËĪaËIJÉŁ": 1863, - "ĠtjËĪe": 1864, - "ËĪaËIJnaËIJ": 1865, - "stɾ": 1866, - "Êİe": 1867, - "ËĮeɪt": 1868, - "ba": 1869, - "ðas": 1870, - "vÊģ": 1871, - "ĠzËĪÉĻËIJ": 1872, - "ËĪaËIJli": 1873, - "ÉŁÊ°eËIJ": 1874, - "ËĪaËIJteËIJ": 1875, - "ĠvËĪa": 1876, - "Ġsal": 1877, - "ËĪaËIJno": 1878, - "ĠÉ¡ÉĻz": 1879, - "ĠhËĪoËIJti": 1880, - "ĠɲËĪiÉĽ": 1881, - "tÉľ": 1882, - "ĠËĪaËIJp": 1883, - "ĠwËĪÉĽl": 1884, - "ĠmËĪɪl": 1885, - "ĠfyËIJɾ": 1886, - "ËĪÉĽËIJsaËIJ": 1887, - "ĠbËĮiËIJ": 1888, - "ËĪaËIJjaËIJ": 1889, - "ËĪɪp": 1890, - "ĠfÊģ": 1891, - "tsiËĪoËIJne": 1892, - "ĠwËĪuÉľ": 1893, - "Ġvi": 1894, - "ĠwËĪÉijÉľn": 1895, - "ËĪoËIJn": 1896, - "ĠÉĹËĪÉĻɪ": 1897, - "ĠÊĿËĪo": 1898, - "Ġra": 1899, - "mÉĻnt": 1900, - "ËĪaÊĬnd": 1901, - "ĠpÉĽÉ¾": 1902, - "ĠÉĹËĪaËIJÊĬ": 1903, - "oËIJɾ": 1904, - "hËĪo": 1905, - "ĠÉĴn": 1906, - "ĠÊİe": 1907, - "ĠsËĪɪks": 1908, - "É¡n": 1909, - "ĠÉ¡ËĪa": 1910, - "Ġθj": 1911, - "ĠpËĪe": 1912, - "spe": 1913, - "ĠvËĪÉĻ": 1914, - "ĠfËĪɪ": 1915, - "ĠËĮɪntÊĬ": 1916, - "lÉĻn": 1917, - "ĠnËĪiËIJd": 1918, - "ĠsËĮÊĬa": 1919, - "ĠËĪum": 1920, - "ĠdËĪeɪ": 1921, - "ĠËĪÊĮbʰi": 1922, - "ËĪÉijËIJɾ": 1923, - "ĠbËĪiÉĽÉľt": 1924, - "Êİos": 1925, - "ĠtshËĪaiÉľ": 1926, - "ĠËĮɪskËĮaËIJ": 1927, - "ĠaÊĬÉĻ": 1928, - "ĠËĪyæ": 1929, - "Ġdyn": 1930, - "ĠmËĪiËIJn": 1931, - "ĠËĪÊĮcʰËIJ": 1932, - "ĠsÉĽ": 1933, - "ĠnËĪy": 1934, - 
"ĠnËĮÉĽl": 1935, - "ɡɾ": 1936, - "ÊĥËĪe": 1937, - "ĠÊĤËĮÉĽ": 1938, - "ĠËĪÉĽvɹɪ": 1939, - "ËĪÉĽlp": 1940, - "ĠbËĪak": 1941, - "ĠeËIJ": 1942, - "ĠfËĪaËIJ": 1943, - "ĠkÉĽl": 1944, - "ĠËĪeËIJs": 1945, - "jËĪaËIJd": 1946, - "ĠlËĮi": 1947, - "mbɾe": 1948, - "ktÉĻ": 1949, - "nta": 1950, - "tËĪu": 1951, - "ĠðËĪat": 1952, - "ĠËĪaβ": 1953, - "ÉĻɹi": 1954, - "ĠkwËĮÉĽlla": 1955, - "ĠbÉĻn": 1956, - "rËĮÉĽ": 1957, - "ĠnÉĶ": 1958, - "ĠÉ¡ËĪɪ": 1959, - "ĠËĪap": 1960, - "ɹÉĻ": 1961, - "ËĪaÉľkh": 1962, - "ĠÊIJËĪi": 1963, - "ĠËĪÉijËIJ": 1964, - "ɪɡÉĻn": 1965, - "ĠwËĪai": 1966, - "ĠpÉĻt": 1967, - "kËIJa": 1968, - "ĠbËĪÉĽËIJ": 1969, - "ËĪeËIJÊĭ": 1970, - "lsÉĻÊĬ": 1971, - "ĠcËĪaËIJhɪËĮeËIJ": 1972, - "ĠkÉĻn": 1973, - "ĠËĮaɪnÉĻm": 1974, - "ËĪuËIJt": 1975, - "ĠhËĪaÊĬ": 1976, - "ĠtËĪanto": 1977, - "ĠhÉIJz": 1978, - "ĠsËĪÊĮɾ": 1979, - "Ġno": 1980, - "ĠtËĪÉĶËIJ": 1981, - "ĠzËĪaɪ": 1982, - "ĠtÉķËĪiÉĽÉľ": 1983, - "ĠkozËĪi": 1984, - "ĠkËĪei": 1985, - "ðËĪÉĶɾ": 1986, - "ËĮÉĶÊģ": 1987, - "ĠtËĪÊĮɾ": 1988, - "ĠÊIJËĪÉĻ": 1989, - "ĠÉķËĪyÉĽÉľ": 1990, - "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, - "mf": 1992, - "ĠvËĪiËIJdÉľ": 1993, - "kËĪa": 1994, - "ĠÉIJÉ¡": 1995, - "kw": 1996, - "ĠÊģÉĽ": 1997, - "xÉĻn": 1998, - "ĠdÊĬ": 1999, - "ĠkËĪÊĮɾnËĮeËIJ": 2000, - "jËĪaËIJdaËIJ": 2001, - "ĠfÉĻ": 2002, - "ĠËĮimp": 2003, - "Ġhɪz": 2004, - "ĠʰÏĩ": 2005, - "ËĪoËIJni": 2006, - "ĠxËĪiÉľ": 2007, - "ËĪeËIJsÊĪ": 2008, - "ÊıbÉľ": 2009, - "ËĮÉĶɾke": 2010, - "ĠÉ¡ËĪÉĻÊĬ": 2011, - "ËĪɪÊĥÉĻn": 2012, - "les": 2013, - "ĠfËĪiËIJ": 2014, - "É¡tÉĻ": 2015, - "ËĪeËIJre": 2016, - "ĠvËĮaËIJ": 2017, - "ĠËĪeɪ": 2018, - "ĠmËĪuÉĻÉľn": 2019, - "ĠÉ¡ËĪÊĬd": 2020, - "ĠmËĮaɪn": 2021, - "zËĪe": 2022, - "ĠlËĪiÉľ": 2023, - "Ġmu": 2024, - "ĠkËĮÉĽl": 2025, - "ĠjËĮÉĻh": 2026, - "ĠfËĮÉĶɾ": 2027, - "fɹ": 2028, - "ĠkËĪaɪn": 2029, - "ĠËĪÉĴlsÉĻÊĬ": 2030, - "θɪÅĭ": 2031, - "ĠthËĪonÉ¡Éľ": 2032, - "tËĪÉij": 2033, - "θjo": 2034, - "mËĪÉĶ": 2035, - "Ġos": 2036, - "ĠsÊĬ": 2037, - "ĠsËĪÊĮmÉĻ": 2038, - "ĠvËĮÉĽn": 2039, - "nËĪo": 2040, - "ĠËĪaktÊĥuËIJ": 2041, - "É£a": 2042, - "Ġtʰi": 
2043, - "ĠfËĮi": 2044, - "ĠvËĪÉĽl": 2045, - "ĠtËĪutËIJi": 2046, - "xos": 2047 - }, - "merges": [ - [ - "Ë", - "Ī" - ], - [ - "Ë", - "IJ" - ], - [ - "ËĪ", - "É" - ], - [ - "Ë", - "Į" - ], - [ - "É", - "Ļ" - ], - [ - "ËĪ", - "a" - ], - [ - "ËĪ", - "i" - ], - [ - "Ġ", - "t" - ], - [ - "É", - "ª" - ], - [ - "É", - "¾" - ], - [ - "Ġ", - "É" - ], - [ - "Ġ", - "k" - ], - [ - "É", - "ľ" - ], - [ - "Ġ", - "s" - ], - [ - "ËĪ", - "e" - ], - [ - "É", - "Ľ" - ], - [ - "ËĪ", - "o" - ], - [ - "Ġ", - "l" - ], - [ - "ËĪÉ", - "Ľ" - ], - [ - "Ġ", - "d" - ], - [ - "Ê", - "Ĭ" - ], - [ - "ËĪa", - "ËIJ" - ], - [ - "Ġ", - "p" - ], - [ - "Ì", - "ĥ" - ], - [ - "Ġ", - "m" - ], - [ - "ËĪ", - "u" - ], - [ - "Å", - "ĭ" - ], - [ - "Ã", - "°" - ], - [ - "ËĪÉ", - "Ķ" - ], - [ - "Ê", - "Į" - ], - [ - "ËĮ", - "a" - ], - [ - "Ġ", - "h" - ], - [ - "ËĪ", - "ÊĮ" - ], - [ - "Ġ", - "n" - ], - [ - "Ê", - "ģ" - ], - [ - "ËĪÉ", - "ij" - ], - [ - "Ê", - "ĥ" - ], - [ - "e", - "ËIJ" - ], - [ - "Ġ", - "a" - ], - [ - "Ġ", - "b" - ], - [ - "É", - "Ķ" - ], - [ - "ËĪÉ", - "Ļ" - ], - [ - "ÉĻ", - "n" - ], - [ - "Ġ", - "f" - ], - [ - "ËĪÉ", - "ª" - ], - [ - "É", - "¡" - ], - [ - "ËĪe", - "ËIJ" - ], - [ - "Ġ", - "j" - ], - [ - "n", - "t" - ], - [ - "Ġ", - "ð" - ], - [ - "Ġ", - "ËĮ" - ], - [ - "Ġt", - "s" - ], - [ - "ĠÉ", - "¡" - ], - [ - "É", - "ķ" - ], - [ - "ËĪo", - "ËIJ" - ], - [ - "Ê", - "°" - ], - [ - "a", - "ËIJ" - ], - [ - "ËĪ", - "y" - ], - [ - "Ġt", - "Éķ" - ], - [ - "ËĪi", - "ËIJ" - ], - [ - "Ġ", - "Ê" - ], - [ - "Ġ", - "v" - ], - [ - "Ġ", - "w" - ], - [ - "s", - "t" - ], - [ - "É", - "ij" - ], - [ - "n", - "d" - ], - [ - "ËĮ", - "i" - ], - [ - "Ì", - "ª" - ], - [ - "ËĮ", - "e" - ], - [ - "Ġ", - "z" - ], - [ - "ËĪa", - "ɪ" - ], - [ - "ËĪi", - "ÉĽ" - ], - [ - "Î", - "²" - ], - [ - "É", - "¹" - ], - [ - "Ġ", - "ËĮa" - ], - [ - "Î", - "¸" - ], - [ - "Ġh", - "ÉĽ" - ], - [ - "Ê", - "Ī" - ], - [ - "i", - "ËIJ" - ], - [ - "ËĮ", - "o" - ], - [ - "Ġ", - "ɪ" - ], - [ - "Éľ", - "n" - ], - [ - "Ġ", - "x" - ], - [ - "Ġt", 
- "ÉĻ" - ], - [ - "ËĪu", - "ËIJ" - ], - [ - "ËĮ", - "ÉĻ" - ], - [ - "Ġj", - "ËĪi" - ], - [ - "ËĮ", - "ÉĽ" - ], - [ - "ĠÉ", - "Ľ" - ], - [ - "Ġ", - "ËĪa" - ], - [ - "ËĮa", - "ËIJ" - ], - [ - "Ġl", - "a" - ], - [ - "Ġð", - "e" - ], - [ - "ĠhÉĽ", - "ËIJ" - ], - [ - "Ġ", - "e" - ], - [ - "Ã", - "§" - ], - [ - "ÉĻ", - "l" - ], - [ - "o", - "ËIJ" - ], - [ - "ËĪÉij", - "u" - ], - [ - "Ê", - "Ĵ" - ], - [ - "u", - "ËIJ" - ], - [ - "ĠÉ", - "Ĺ" - ], - [ - "ĠÉ", - "ķ" - ], - [ - "ËĮ", - "eËIJ" - ], - [ - "ĠtÉķ", - "ËĪi" - ], - [ - "o", - "s" - ], - [ - "ËĪÉĶ", - "ËIJ" - ], - [ - "a", - "s" - ], - [ - "ËĪ", - "ÊĬ" - ], - [ - "Ġ", - "i" - ], - [ - "ËĪa", - "i" - ], - [ - "É", - "²" - ], - [ - "ɪ", - "n" - ], - [ - "t", - "s" - ], - [ - "Éľ", - "Åĭ" - ], - [ - "ĠÉ", - "Ł" - ], - [ - "Ġ", - "Êĥ" - ], - [ - "ËĪe", - "ɪ" - ], - [ - "ÉĽ", - "ɾ" - ], - [ - "ËĪÉĽ", - "ËIJ" - ], - [ - "ËĪÉĽ", - "ɾ" - ], - [ - "Ġ", - "r" - ], - [ - "t", - "Êĥ" - ], - [ - "ËĮ", - "ÉĶ" - ], - [ - "Ġd", - "ÉĻ" - ], - [ - "t", - "ÉĻ" - ], - [ - "o", - "u" - ], - [ - "ËĪy", - "ÉĻ" - ], - [ - "ĠËĮ", - "i" - ], - [ - "ÉĻ", - "ɾ" - ], - [ - "ËĪÉĻ", - "ÊĬ" - ], - [ - "ËĪÊĮ", - "ɾ" - ], - [ - "ËĪÉ", - "Ĵ" - ], - [ - "Ġt", - "h" - ], - [ - "ËĪo", - "n" - ], - [ - "Ê", - "ĭ" - ], - [ - "ËĪÉij", - "ËIJ" - ], - [ - "ËĪÊĮ", - "h" - ], - [ - "w", - "ËĪa" - ], - [ - "ËĪe", - "i" - ], - [ - "l", - "l" - ], - [ - "ĠÉ", - "IJ" - ], - [ - "Éij", - "ËIJ" - ], - [ - "a", - "n" - ], - [ - "É", - "Ł" - ], - [ - "ĠÊ", - "ĭ" - ], - [ - "Ġk", - "o" - ], - [ - "k", - "h" - ], - [ - "ɪ", - "Åĭ" - ], - [ - "ËĪaËIJ", - "ɪ" - ], - [ - "Ġt", - "Êĥ" - ], - [ - "ËĪaËIJ", - "t" - ], - [ - "ĠËĮ", - "e" - ], - [ - "ĠtÉķ", - "h" - ], - [ - "ËĪu", - "o" - ], - [ - "ËĪon", - "É¡" - ], - [ - "É", - "ĸ" - ], - [ - "a", - "t" - ], - [ - "Ġk", - "e" - ], - [ - "É", - "Ĵ" - ], - [ - "ĠÉķ", - "ËĪi" - ], - [ - "Ã", - "¸" - ], - [ - "ĠÉ", - "ij" - ], - [ - "ËĪeËIJ", - "k" - ], - [ - "Å", - "ĵ" - ], - [ - "r", - "e" - ], - [ - "Ġ", - "ɾ" - ], - [ - "Ġk", 
- "ÉĶ" - ], - [ - "ËĮ", - "ÊĬ" - ], - [ - "s", - "k" - ], - [ - "Ġ", - "ÊĬ" - ], - [ - "Ġa", - "nd" - ], - [ - "ɪ", - "ç" - ], - [ - "Ġm", - "e" - ], - [ - "ËĪa", - "ɾ" - ], - [ - "Ġ", - "ËĪɪ" - ], - [ - "n", - "a" - ], - [ - "Ġ", - "β" - ], - [ - "Ġl", - "ËĪi" - ], - [ - "j", - "aËIJ" - ], - [ - "l", - "i" - ], - [ - "n", - "o" - ], - [ - "Ġɪ", - "n" - ], - [ - "Ġd", - "ËĮi" - ], - [ - "ĠÉ", - "²" - ], - [ - "t", - "ËIJ" - ], - [ - "ÉĻ", - "m" - ], - [ - "Ġl", - "ÉĻ" - ], - [ - "Ġð", - "ÉĻ" - ], - [ - "ɪ", - "k" - ], - [ - "ËĪÉĽ", - "l" - ], - [ - "Éľ", - "t" - ], - [ - "Ġs", - "e" - ], - [ - "e", - "s" - ], - [ - "ËĪo", - "u" - ], - [ - "ËĪa", - "ÊĬ" - ], - [ - "ĠÉ", - "Ķ" - ], - [ - "ɪ", - "t" - ], - [ - "Ġ", - "Åĭ" - ], - [ - "ËĪÉĽ", - "n" - ], - [ - "Ê", - "İ" - ], - [ - "Ġk", - "h" - ], - [ - "ËĪÉĽ", - "nt" - ], - [ - "ËĪaËIJ", - "ɾ" - ], - [ - "Ġk", - "i" - ], - [ - "m", - "p" - ], - [ - "l", - "t" - ], - [ - "É", - "£" - ], - [ - "Ġp", - "a" - ], - [ - "ËĪÉĻ", - "ËIJ" - ], - [ - "ɪ", - "s" - ], - [ - "ĠÉ", - "Ĵ" - ], - [ - "Ġl", - "e" - ], - [ - "ɪ", - "Éľ" - ], - [ - "ËĪÉĽ", - "t" - ], - [ - "Ġd", - "e" - ], - [ - "ĠÉ", - "¹" - ], - [ - "Ġt", - "ËĪoËIJ" - ], - [ - "Ġ", - "Êģ" - ], - [ - "Êĥ", - "ÉĻn" - ], - [ - "ĠÊĬ", - "nt" - ], - [ - "ËĪÉĶ", - "ɾ" - ], - [ - "ËĪa", - "ð" - ], - [ - "Ġa", - "ɪ" - ], - [ - "ĠÊ", - "IJ" - ], - [ - "Ġm", - "ËĪa" - ], - [ - "r", - "a" - ], - [ - "Ġk", - "ËĪɪ" - ], - [ - "k", - "t" - ], - [ - "ËIJ", - "p" - ], - [ - "ĠÊ", - "Ī" - ], - [ - "ËĪaËIJ", - "ÊĬ" - ], - [ - "Ġk", - "ËĪÊĮɾ" - ], - [ - "Ġ", - "ËĪÊĮ" - ], - [ - "ĠÉĴ", - "v" - ], - [ - "Ġe", - "l" - ], - [ - "k", - "s" - ], - [ - "Ġk", - "w" - ], - [ - "ÉĻ", - "t" - ], - [ - "nd", - "o" - ], - [ - "e", - "i" - ], - [ - "ĠËĮa", - "ËIJp" - ], - [ - "s", - "e" - ], - [ - "ÉĻ", - "ɹ" - ], - [ - "ËĪu", - "ei" - ], - [ - "ÉĻ", - "s" - ], - [ - "Ġk", - "ËĮo" - ], - [ - "ĠÊ", - "Ĥ" - ], - [ - "ĠËĮ", - "ÊĬ" - ], - [ - "Ġ", - "c" - ], - [ - "ĠÉĽ", - "n" - ], - [ - "ËĪa", - "nt" - 
], - [ - "θ", - "j" - ], - [ - "ËĮo", - "ËIJ" - ], - [ - "Ġ", - "ËĪaËIJ" - ], - [ - "Ġp", - "ɾ" - ], - [ - "s", - "i" - ], - [ - "Ġ", - "ËĪe" - ], - [ - "Ġj", - "uËIJ" - ], - [ - "Ġk", - "ËĮe" - ], - [ - "ËĮ", - "ɪ" - ], - [ - "ÉĶ", - "n" - ], - [ - "Ġs", - "ËĪÊĮ" - ], - [ - "Ġ", - "ËĪu" - ], - [ - "n", - "i" - ], - [ - "Ġs", - "t" - ], - [ - "Ġd", - "iËIJ" - ], - [ - "Ġk", - "eËIJ" - ], - [ - "ĠjËĪi", - "ou" - ], - [ - "ËĪai", - "Éľ" - ], - [ - "Ġd", - "ÊĴ" - ], - [ - "Ġ", - "ËĪÉĶ" - ], - [ - "v", - "a" - ], - [ - "ËIJ", - "ɾ" - ], - [ - "ËĪ", - "ø" - ], - [ - "ËĮÉĻ", - "ÊĬ" - ], - [ - "Ġp", - "ËĪu" - ], - [ - "Ġs", - "u" - ], - [ - "Ġm", - "a" - ], - [ - "Ġ", - "ÉĻ" - ], - [ - "d", - "ÊĴ" - ], - [ - "Ġp", - "ʰ" - ], - [ - "l", - "e" - ], - [ - "i", - "n" - ], - [ - "ĠtÉķh", - "ËĪi" - ], - [ - "Ġw", - "ËĪo" - ], - [ - "r", - "o" - ], - [ - "ËĮ", - "y" - ], - [ - "ɾ", - "a" - ], - [ - "Ġs", - "ËĪi" - ], - [ - "ð", - "ÉĻ" - ], - [ - "Ġs", - "eËIJ" - ], - [ - "l", - "a" - ], - [ - "ĠÊ", - "Ĵ" - ], - [ - "m", - "b" - ], - [ - "Ġh", - "ËĪoËIJ" - ], - [ - "Ġb", - "ʰ" - ], - [ - "ĠÉĽ", - "ɾ" - ], - [ - "Ġð", - "at" - ], - [ - "s", - "p" - ], - [ - "ÉĶ", - "ɾ" - ], - [ - "e", - "n" - ], - [ - "Ġs", - "ÉĻ" - ], - [ - "ËĪÉĶ", - "Éľ" - ], - [ - "Ġl", - "ËĮa" - ], - [ - "ĠËĮ", - "ÉĽ" - ], - [ - "Ġ", - "ËĪy" - ], - [ - "É¡", - "aËIJ" - ], - [ - "Ġd", - "ÉĽÉ¾" - ], - [ - "ËĪÉĽ", - "Êģ" - ], - [ - "Éľ", - "kh" - ], - [ - "ËĪi", - "ÉĻ" - ], - [ - "ËĪa", - "n" - ], - [ - "Ġm", - "ËĪo" - ], - [ - "ËĪa", - "β" - ], - [ - "Ġa", - "l" - ], - [ - "Ġ", - "ËĪeËIJ" - ], - [ - "Ġ", - "θ" - ], - [ - "Ġn", - "ËĪi" - ], - [ - "p", - "ʰ" - ], - [ - "ll", - "a" - ], - [ - "Ġp", - "l" - ], - [ - "ËĪ", - "Åĵ" - ], - [ - "j", - "ËĪÉiju" - ], - [ - "Ġa", - "v" - ], - [ - "Ġm", - "ËĪi" - ], - [ - "Ġf", - "ËĪa" - ], - [ - "ËĪÉ", - "ľ" - ], - [ - "m", - "e" - ], - [ - "ËĮÉĻ", - "h" - ], - [ - "ËĪu", - "ÉĻ" - ], - [ - "i", - "t" - ], - [ - "j", - "ËĪe" - ], - [ - "Ġ", - "o" - ], - [ - "ËĪÉľ", - "ËIJ" - 
], - [ - "ĠtÉķËĪi", - "ou" - ], - [ - "ÉĶ", - "ËIJ" - ], - [ - "Ġn", - "ÉĻ" - ], - [ - "ËĪÉĻ", - "Éľn" - ], - [ - "Ġm", - "ÉĻ" - ], - [ - "Ġd", - "eËIJ" - ], - [ - "m", - "o" - ], - [ - "s", - "a" - ], - [ - "j", - "ËĪÉĶ" - ], - [ - "ËĪa", - "l" - ], - [ - "ĠtÉķ", - "ËĪiÉĽ" - ], - [ - "ĠÉ¡", - "ÉĻ" - ], - [ - "ð", - "a" - ], - [ - "Ġɪ", - "z" - ], - [ - "Ġs", - "a" - ], - [ - "r", - "i" - ], - [ - "ĠËĮi", - "l" - ], - [ - "ËĮ", - "u" - ], - [ - "Ġk", - "aËIJ" - ], - [ - "ĠÉĻ", - "ËIJ" - ], - [ - "ĠÉ", - "ĸ" - ], - [ - "Ġk", - "a" - ], - [ - "ËĪÊĮh", - "i" - ], - [ - "Ġj", - "eËIJ" - ], - [ - "Ġt", - "ʰ" - ], - [ - "n", - "e" - ], - [ - "k", - "ËIJ" - ], - [ - "Ġts", - "ËĪai" - ], - [ - "Ġ", - "ËĪeËIJk" - ], - [ - "n", - "k" - ], - [ - "t", - "i" - ], - [ - "ËĪa", - "Éľn" - ], - [ - "Ġk", - "ËIJ" - ], - [ - "É¡", - "ÉĻn" - ], - [ - "ËĪi", - "a" - ], - [ - "ĠÉĶ", - "ËIJɾ" - ], - [ - "Ê", - "ı" - ], - [ - "ĠËĮ", - "ÊĮ" - ], - [ - "Ġz", - "ËĪaËIJ" - ], - [ - "Ġl", - "os" - ], - [ - "ÉĽ", - "s" - ], - [ - "ËĪÉĶ", - "n" - ], - [ - "ÉĽ", - "nt" - ], - [ - "ÉĽ", - "n" - ], - [ - "ĠÉŁ", - "ËĪoËIJ" - ], - [ - "ç", - "t" - ], - [ - "Ġd", - "as" - ], - [ - "Ġx", - "ËĮo" - ], - [ - "ËĪu", - "Éľ" - ], - [ - "ËĪa", - "s" - ], - [ - "Ġb", - "ËĪÊĮ" - ], - [ - "ËĪiÉĽ", - "Éľn" - ], - [ - "É", - "IJ" - ], - [ - "Ġts", - "uËIJ" - ], - [ - "Ġp", - "ËĮÉĽ" - ], - [ - "Ġn", - "ËĪÉĶ" - ], - [ - "ÊĬ", - "t" - ], - [ - "m", - "a" - ], - [ - "Ġn", - "ËĪo" - ], - [ - "Ġl", - "ËĪɪ" - ], - [ - "ËĪÉĽ", - "s" - ], - [ - "ɪ", - "l" - ], - [ - "ĠÉķ", - "ËĪiÉĽ" - ], - [ - "Ġ", - "ËĪÊĬ" - ], - [ - "ÉĴ", - "t" - ], - [ - "t", - "o" - ], - [ - "Ġ", - "ËĪo" - ], - [ - "ËĮo", - "n" - ], - [ - "Ġk", - "wËĪa" - ], - [ - "Ġɪ", - "t" - ], - [ - "Ġh", - "oËIJ" - ], - [ - "ËĪiËIJ", - "k" - ], - [ - "ĠËĮaËIJp", - "k" - ], - [ - "ËĪaɪ", - "n" - ], - [ - "Ã", - "¦" - ], - [ - "ÉĻn", - "t" - ], - [ - "t", - "a" - ], - [ - "l", - "o" - ], - [ - "Ġn", - "ËĪÉij" - ], - [ - "Ġl", - "ËĪa" - ], - [ - "ËĪi", - "Éľ" - ], - 
[ - "Ġw", - "ËĪei" - ], - [ - "ÉĽ", - "Êģ" - ], - [ - "Ġt", - "ËĪa" - ], - [ - "Ġɾ", - "ËĮÉĻh" - ], - [ - "ĠÉķËĪi", - "Éij" - ], - [ - "ËĮi", - "ËIJ" - ], - [ - "ËĮÉĽ", - "l" - ], - [ - "ĠtÉĻ", - "Éľ" - ], - [ - "Ġk", - "ËĪuo" - ], - [ - "Ġt", - "ËĪu" - ], - [ - "j", - "ËĪÉĽ" - ], - [ - "ĠËĮi", - "n" - ], - [ - "ɾ", - "e" - ], - [ - "Ġk", - "oËIJ" - ], - [ - "Ġk", - "ËĪa" - ], - [ - "ɾ", - "i" - ], - [ - "ĠtÉķËĪi", - "Éij" - ], - [ - "l", - "ÉĻ" - ], - [ - "Ġk", - "ÉĻ" - ], - [ - "Ġt", - "ËĪi" - ], - [ - "ĠÅĭ", - "ËĪyÉĻ" - ], - [ - "Ġts", - "h" - ], - [ - "e", - "r" - ], - [ - "a", - "v" - ], - [ - "ĠkÉĶ", - "n" - ], - [ - "ËĪÉĻ", - "ÉľÅĭ" - ], - [ - "ð", - "o" - ], - [ - "ËĪaËIJ", - "n" - ], - [ - "Ġbʰ", - "ËĪi" - ], - [ - "ĠkËIJ", - "jaËIJ" - ], - [ - "ÉĻ", - "z" - ], - [ - "Ġp", - "Êģ" - ], - [ - "Ġd", - "ËĪɪ" - ], - [ - "Ġz", - "iËIJ" - ], - [ - "É¡", - "eËIJ" - ], - [ - "Ġt", - "ËĪÉĻ" - ], - [ - "ɪ", - "z" - ], - [ - "Ġn", - "ËĮon" - ], - [ - "t", - "aËIJ" - ], - [ - "b", - "l" - ], - [ - "t", - "e" - ], - [ - "n", - "ËĮeËIJ" - ], - [ - "ËĪɪ", - "l" - ], - [ - "s", - "o" - ], - [ - "k", - "o" - ], - [ - "u", - "Êģ" - ], - [ - "ĠÉ", - "£" - ], - [ - "Ġpa", - "Êģ" - ], - [ - "Ġ", - "ËĪÉĽ" - ], - [ - "j", - "ËĪuËIJ" - ], - [ - "ËĮ", - "ÊĮ" - ], - [ - "y", - "n" - ], - [ - "ËĪiËIJ", - "n" - ], - [ - "Ġl", - "ËĪaɪ" - ], - [ - "ËĪɪ", - "Åĭ" - ], - [ - "ĠtÉķh", - "ËĪy" - ], - [ - "Ġn", - "ËĪÊĮhi" - ], - [ - "Ġd", - "ËĮe" - ], - [ - "Ġj", - "ËĪÉiju" - ], - [ - "Ġt", - "ËĪÉiju" - ], - [ - "Ġh", - "ËĪo" - ], - [ - "ɪ", - "d" - ], - [ - "Ġth", - "ËĪÉij" - ], - [ - "m", - "ËĪe" - ], - [ - "Ġ", - "ËĪÉĻ" - ], - [ - "j", - "a" - ], - [ - "Ġp", - "h" - ], - [ - "ÉĽ", - "t" - ], - [ - "Ġk", - "ËĪÊĮ" - ], - [ - "t", - "ÉĻn" - ], - [ - "m", - "ËĪÉij" - ], - [ - "w", - "ËĪe" - ], - [ - "ĠËĮa", - "ɪn" - ], - [ - "Ġð", - "ɪs" - ], - [ - "É¡", - "ÉĻ" - ], - [ - "Ġn", - "ËĪaËIJ" - ], - [ - "Ġb", - "ËĪaËIJ" - ], - [ - "Ġa", - "θ" - ], - [ - "Ġm", - "ËĮa" - ], - [ - "ËĪÊĮh", - "a" - ], 
- [ - "Ġd", - "ËĮa" - ], - [ - "ËĪ", - "Êı" - ], - [ - "Ġɲ", - "ËĮy" - ], - [ - "Ġp", - "ËĪa" - ], - [ - "ËĪað", - "o" - ], - [ - "d", - "i" - ], - [ - "b", - "Éľ" - ], - [ - "É", - "³" - ], - [ - "Ġw", - "iËIJ" - ], - [ - "Ġn", - "ËĪɪ" - ], - [ - "ĠÉ¡", - "ËĪÉĶÉľ" - ], - [ - "tËIJ", - "o" - ], - [ - "ËĮÉĻ", - "m" - ], - [ - "ËĪaËIJ", - "r" - ], - [ - "Ġm", - "ÉĽ" - ], - [ - "ËĪeËIJ", - "É¡aËIJ" - ], - [ - "Ġs", - "ËĮi" - ], - [ - "Ġl", - "ËĮaËIJ" - ], - [ - "n", - "ËĮaËIJ" - ], - [ - "Ġs", - "p" - ], - [ - "t", - "Êģ" - ], - [ - "ĠÊ", - "İ" - ], - [ - "ËĮ", - "ÉijËIJ" - ], - [ - "Ġk", - "l" - ], - [ - "k", - "ʰ" - ], - [ - "i", - "l" - ], - [ - "ĠÊĥ", - "t" - ], - [ - "ĠËĮÊĬ", - "n" - ], - [ - "a", - "l" - ], - [ - "Ġs", - "ËĪÉĽ" - ], - [ - "Ġm", - "ËĪaËIJ" - ], - [ - "Ġ", - "Åĵ" - ], - [ - "ĠÉ¡", - "ËĪÊĮ" - ], - [ - "ĠpËĮÉĽ", - "r" - ], - [ - "ɾ", - "ËĪa" - ], - [ - "ËIJ", - "ÊĪ" - ], - [ - "ËĪaβ", - "a" - ], - [ - "Ġw", - "ËĪÉĴ" - ], - [ - "Ġx", - "ËĪuei" - ], - [ - "Ġkh", - "ËĪo" - ], - [ - "Ġla", - "s" - ], - [ - "ĠÉĹ", - "ËĪo" - ], - [ - "Ġf", - "ÉĽÉ¾" - ], - [ - "Ġj", - "ËĪiÉĽ" - ], - [ - "Ġt", - "ËĪe" - ], - [ - "Ġk", - "ËĮÉĶ" - ], - [ - "ĠdeËIJ", - "n" - ], - [ - "Ġm", - "o" - ], - [ - "Ġp", - "ËĪi" - ], - [ - "Ġt", - "ËĪÉij" - ], - [ - "ËĪÉĽ", - "st" - ], - [ - "w", - "ËĪÉij" - ], - [ - "ËĪaɪ", - "t" - ], - [ - "ÉĻ", - "ÊĬ" - ], - [ - "Ġ", - "ËĪi" - ], - [ - "ɪ", - "j" - ], - [ - "a", - "ɪ" - ], - [ - "ËĪaËIJ", - "Éľ" - ], - [ - "ĠËĪɪ", - "s" - ], - [ - "Ġp", - "ÉĶɾ" - ], - [ - "æ", - "Éľn" - ], - [ - "k", - "a" - ], - [ - "Åĭ", - "É¡" - ], - [ - "b", - "ÉĻn" - ], - [ - "ÊĬ", - "f" - ], - [ - "Ġp", - "ɹ" - ], - [ - "Ġl", - "ËĮe" - ], - [ - "ËĪiËIJ", - "d" - ], - [ - "ËĪaËIJ", - "re" - ], - [ - "Ġm", - "ËĪÊĮ" - ], - [ - "ÉĻ", - "r" - ], - [ - "Ġd", - "Éij" - ], - [ - "ËĪaËIJt", - "o" - ], - [ - "Ġp", - "ËĪeËIJ" - ], - [ - "Ġd", - "ËĪoËIJ" - ], - [ - "Ġs", - "ËĮÊĬ" - ], - [ - "Ġh", - "ËĪi" - ], - [ - "Ġs", - "ËĪa" - ], - [ - "ËĪeËIJ", - "n" - ], - [ - "d", 
- "ÉĻ" - ], - [ - "Ġp", - "j" - ], - [ - "ËĪÅĵ", - "Êģ" - ], - [ - "l", - "ɪç" - ], - [ - "ÉĴ", - "n" - ], - [ - "ĠËĪÉĻ", - "r" - ], - [ - "t", - "ËĪe" - ], - [ - "Ġi", - "l" - ], - [ - "ËĪaËIJ", - "l" - ], - [ - "Ġs", - "ËĮÉĻÊĬ" - ], - [ - "s", - "ÊĪ" - ], - [ - "Ġd", - "ËĪuËIJ" - ], - [ - "h", - "ËĪÉij" - ], - [ - "Ġx", - "ËĪou" - ], - [ - "Ġl", - "ËĪaiÉľ" - ], - [ - "w", - "ËĪo" - ], - [ - "ËĪÉĽnt", - "e" - ], - [ - "Ġs", - "y" - ], - [ - "Ġz", - "ɪç" - ], - [ - "ĠÉ¡", - "ËĪu" - ], - [ - "ĠÉķ", - "ËĪy" - ], - [ - "ËĪÉĶËIJ", - "l" - ], - [ - "ÉĶ", - "l" - ], - [ - "Ġt", - "ËĪo" - ], - [ - "ĠÊĭ", - "oËIJ" - ], - [ - "Ġ", - "iËIJ" - ], - [ - "wËĪa", - "ða" - ], - [ - "ËĪa", - "ndo" - ], - [ - "Ġaθ", - "ÉĽnt" - ], - [ - "Ġaθɼnt", - "wËĪaða" - ], - [ - "Ġt", - "ËĪiÉĽ" - ], - [ - "ËĪei", - "Éľ" - ], - [ - "Ġp", - "ËĮa" - ], - [ - "Ġn", - "ËĪaɪ" - ], - [ - "w", - "a" - ], - [ - "Ġf", - "r" - ], - [ - "ĠÊIJ", - "ËĪÉĻÉľn" - ], - [ - "ËĪu", - "a" - ], - [ - "m", - "i" - ], - [ - "Ġm", - "ËĪÉĽ" - ], - [ - "ËĪeËIJk", - "ʰ" - ], - [ - "c", - "ʰ" - ], - [ - "Ġw", - "ËĪÉij" - ], - [ - "st", - "a" - ], - [ - "Ġt", - "u" - ], - [ - "Ġs", - "k" - ], - [ - "ËĪÉĶ", - "l" - ], - [ - "ËĪeËIJ", - "ÊĪ" - ], - [ - "Ġl", - "ËĪaËIJɪ" - ], - [ - "Ġl", - "ËĪaËIJ" - ], - [ - "ËĪÉĽËIJ", - "s" - ], - [ - "ËĪÉĽÉ¾", - "a" - ], - [ - "ËĪÉĻ", - "Éľt" - ], - [ - "Ġ", - "yn" - ], - [ - "d", - "ÉĻn" - ], - [ - "Ġd", - "i" - ], - [ - "ËĪiËIJ", - "s" - ], - [ - "Ġðe", - "l" - ], - [ - "ËĪÊĮ", - "r" - ], - [ - "Ġh", - "ËĪaËIJ" - ], - [ - "Ġb", - "ÉĻ" - ], - [ - "Ġj", - "ËĪuËIJ" - ], - [ - "ll", - "e" - ], - [ - "st", - "o" - ], - [ - "ËĪɪ", - "t" - ], - [ - "ËĪoËIJ", - "ɾ" - ], - [ - "b", - "ʰ" - ], - [ - "m", - "ÉĻn" - ], - [ - "ËĮu", - "ÉĻ" - ], - [ - "ËĮÉĻ", - "ɾ" - ], - [ - "ËĪÊĮ", - "n" - ], - [ - "ĠlËĪaɪ", - "k" - ], - [ - "Ġb", - "ËĪa" - ], - [ - "ɪ", - "ð" - ], - [ - "Ġl", - "o" - ], - [ - "z", - "i" - ], - [ - "ËĪÊĮ", - "st" - ], - [ - "m", - "ËĪi" - ], - [ - "ÉĶ", - "Êģ" - ], - [ - "ĠnËĪɪ", - 
"çt" - ], - [ - "Ġt", - "ɾ" - ], - [ - "Ġd", - "ËĪeËIJkʰ" - ], - [ - "Ġs", - "ËĮe" - ], - [ - "Ġn", - "ËĪÉĻÊĬ" - ], - [ - "Ġ", - "u" - ], - [ - "Ġs", - "i" - ], - [ - "Ġɪ", - "ç" - ], - [ - "Ġp", - "r" - ], - [ - "ĠtÉķ", - "ËĪy" - ], - [ - "Ġm", - "ËĪu" - ], - [ - "z", - "a" - ], - [ - "Ġt", - "Êģ" - ], - [ - "Ġw", - "ɪð" - ], - [ - "t", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪÊĮɾ" - ], - [ - "Ġk", - "ËĪÉĶ" - ], - [ - "ËĪoËIJ", - "r" - ], - [ - "Ġh", - "ËĮa" - ], - [ - "Ġk", - "ËĪonÉ¡" - ], - [ - "Ġp", - "uÊģ" - ], - [ - "Ġd", - "y" - ], - [ - "ËĪɪ", - "n" - ], - [ - "nt", - "e" - ], - [ - "Ġk", - "ËĮa" - ], - [ - "ËĪÉĻ", - "ɪ" - ], - [ - "Ġm", - "i" - ], - [ - "ĠÉ¡", - "ËĮuÉĻ" - ], - [ - "ĠÊ", - "²" - ], - [ - "Ġf", - "ËĪÉij" - ], - [ - "Ġv", - "ÉijËIJ" - ], - [ - "ĠËĮa", - "ÊĬ" - ], - [ - "ËĮ", - "uËIJ" - ], - [ - "ĠËĪu", - "n" - ], - [ - "Ġj", - "ËĪÊĮha" - ], - [ - "j", - "uËIJ" - ], - [ - "Ġm", - "ɪt" - ], - [ - "Ġl", - "ËĪÉĽ" - ], - [ - "ËĪeËIJ", - "Êĥ" - ], - [ - "Ġf", - "ÉĶËIJ" - ], - [ - "m", - "ÉĻ" - ], - [ - "ɾ", - "t" - ], - [ - "ĠkËĮo", - "n" - ], - [ - "Ġl", - "ËĪÉĶ" - ], - [ - "Ġx", - "ËĪÉiju" - ], - [ - "p", - "l" - ], - [ - "Ġd", - "ËĪi" - ], - [ - "Ġl", - "ËĪoËIJ" - ], - [ - "s", - "ÉĻ" - ], - [ - "ËĪaËIJ", - "va" - ], - [ - "Ġl", - "ËĪu" - ], - [ - "ĠÉ¡", - "ËĮÉĻÊĬ" - ], - [ - "Ġh", - "av" - ], - [ - "ĠËĮaËIJpk", - "ËĮoËIJ" - ], - [ - "ɾ", - "ËĪi" - ], - [ - "Ġf", - "ËĪÉĻ" - ], - [ - "Ġh", - "ËĮÉĻm" - ], - [ - "ËĪonÉ¡", - "Éľ" - ], - [ - "j", - "o" - ], - [ - "Ġs", - "ÉĶ" - ], - [ - "ËĪaËIJ", - "d" - ], - [ - "w", - "ËĪiÉĻ" - ], - [ - "ËĪa", - "nd" - ], - [ - "ËĮa", - "ɪn" - ], - [ - "t", - "ɾ" - ], - [ - "ĠËĮ", - "ɪ" - ], - [ - "ĠËĪu", - "na" - ], - [ - "Ġx", - "wËĪÉij" - ], - [ - "Ġj", - "ÉĶËIJ" - ], - [ - "Êģ", - "ËĪi" - ], - [ - "ĠkËĪuo", - "Éľ" - ], - [ - "Ġa", - "β" - ], - [ - "ĠÉ¡", - "ËĪaËIJ" - ], - [ - "an", - "o" - ], - [ - "t", - "ÉĻl" - ], - [ - "Ġr", - "ËĮe" - ], - [ - "ËĮÊĮ", - "t" - ], - [ - "ĠjËĪi", - "Éij" - ], - [ - "ĠɾËĮÉĻh", - "aËIJ" - 
], - [ - "Ġm", - "ËĪe" - ], - [ - "ĠËĪy", - "Ã¦Éľn" - ], - [ - "Ġf", - "ËĪu" - ], - [ - "Ġb", - "l" - ], - [ - "n", - "ËĪi" - ], - [ - "s", - "ÉĻn" - ], - [ - "Ġa", - "ɪn" - ], - [ - "ËĪi", - "ÊĬ" - ], - [ - "Ġðe", - "ɪ" - ], - [ - "Ġɪ", - "ts" - ], - [ - "Ġ", - "(" - ], - [ - "ËĪy", - "ËIJ" - ], - [ - "ÉĻ", - "d" - ], - [ - "ĠËĮ", - "o" - ], - [ - "ĠÉĽ", - "s" - ], - [ - "Ġv", - "iËIJ" - ], - [ - "ËIJ", - "É¡eËIJ" - ], - [ - "k", - "ËĪe" - ], - [ - "ĠËĪa", - "l" - ], - [ - "ÉĽ", - "l" - ], - [ - "Ġ", - "ÊĮ" - ], - [ - "ËIJ", - "o" - ], - [ - "Ġk", - "ËĪo" - ], - [ - "ĠÊĪ", - "ËĪuËIJ" - ], - [ - "Ġs", - "ËĪɪ" - ], - [ - "ËĪeËIJ", - "ɾ" - ], - [ - "Éľ", - "m" - ], - [ - "ËĮ", - "ÉĻn" - ], - [ - "ËĪaËIJ", - "i" - ], - [ - "ËĪoËIJ", - "l" - ], - [ - "ɪ", - "ËĮeËIJ" - ], - [ - "Ġʲ", - "ËĪy" - ], - [ - "Ġk", - "ËĪÉĶËIJ" - ], - [ - "s", - "ËĪi" - ], - [ - "Ġl", - "ËĪe" - ], - [ - "ËĮ", - "ÉĴt" - ], - [ - "ËĪiËIJ", - "p" - ], - [ - "a", - "Êģ" - ], - [ - "Ġθ", - "ËĪɪÅĭ" - ], - [ - "ËĪÉĻËIJ", - "ɪ" - ], - [ - "ËĪÊĮ", - "l" - ], - [ - "ĠhËĪoËIJ", - "taËIJ" - ], - [ - "ËĪo", - "ɪ" - ], - [ - "nt", - "o" - ], - [ - "z", - "h" - ], - [ - "ĠdeËIJ", - "m" - ], - [ - "ĠkÉĶ", - "m" - ], - [ - "ʰ", - "ËĪiËIJk" - ], - [ - "ĠdÊĴ", - "ËĪÊĮst" - ], - [ - "p", - "ɾ" - ], - [ - "Ġl", - "y" - ], - [ - "h", - "ËĪu" - ], - [ - "ËĪÉĶ", - "ø" - ], - [ - "ËĪaËIJ", - "s" - ], - [ - "ĠËĪa", - "n" - ], - [ - "Ġ", - "ËĪÉĴ" - ], - [ - "Ġk", - "an" - ], - [ - "Ġts", - "ËĪuo" - ], - [ - "ËĪeËIJ", - "va" - ], - [ - "ĠÉ¡", - "ɾ" - ], - [ - "Ġp", - "o" - ], - [ - "ĠtÊĥ", - "ËĪÉĶ" - ], - [ - "Êİ", - "a" - ], - [ - "Ġm", - "ËĮi" - ], - [ - "Êĥ", - "t" - ], - [ - "t", - "ËĪi" - ], - [ - "Ġh", - "ËĪÊĮ" - ], - [ - "tÊĥ", - "e" - ], - [ - "Ġf", - "ÉĶn" - ], - [ - "v", - "e" - ], - [ - "Ġn", - "ËĮe" - ], - [ - "ËĪÉĶ", - "Êģ" - ], - [ - "i", - "z" - ], - [ - "Ġs", - "ËĪuo" - ], - [ - "ËĪÉĽËIJ", - "r" - ], - [ - "wËĪa", - "Êģ" - ], - [ - "ËĪað", - "a" - ], - [ - "Åĭ", - "k" - ], - [ - "p", - "o" - ], - [ - "Ġk", 
- "ËĪi" - ], - [ - "ËĪa", - "d" - ], - [ - "Ġv", - "ËĪi" - ], - [ - "t", - "Éķ" - ], - [ - "Ġk", - "ËĪÉĻ" - ], - [ - "Ġw", - "ËĪu" - ], - [ - "ÉĴ", - "z" - ], - [ - "ĠvÉijËIJ", - "ɾ" - ], - [ - "Êģ", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪaËIJ" - ], - [ - "k", - "e" - ], - [ - "n", - "ÉĻ" - ], - [ - "ËĪÊĮ", - "b" - ], - [ - "ËĪuËIJ", - "ɾ" - ], - [ - "ËĮÉĻ", - "ËIJ" - ], - [ - "ĠÊĪ", - "ʰËĪiËIJk" - ], - [ - "Ġk", - "ËĪu" - ], - [ - "Ġb", - "ËĮÊĮt" - ], - [ - "Ġa", - "t" - ], - [ - "Ġf", - "ɹ" - ], - [ - "ËĪa", - "x" - ], - [ - "Ġz", - "oËIJ" - ], - [ - "Ġt", - "ËĪaËIJ" - ], - [ - "Ġð", - "ËĮe" - ], - [ - "n", - "eËIJ" - ], - [ - "ĠÉij", - "ËIJ" - ], - [ - "Ġa", - "ÊĬf" - ], - [ - "a", - "m" - ], - [ - "ÊĬ", - "Åĭ" - ], - [ - "ĠÉĶ", - "ËIJ" - ], - [ - "ĠÉķËĪi", - "ÉľÅĭ" - ], - [ - "Ġ", - "ËĪÉĶËIJl" - ], - [ - "ɪ", - "m" - ], - [ - "j", - "ËĪo" - ], - [ - "ËĪiËIJ", - "ÉŁ" - ], - [ - "Ġkw", - "ËĮÉĽ" - ], - [ - "ĠmËĪa", - "s" - ], - [ - "ÉĻ", - "h" - ], - [ - "ĠËĪa", - "ÊĬ" - ], - [ - "ËĪÉĶ", - "ɪ" - ], - [ - "É¡", - "ÉĻɾ" - ], - [ - "r", - "ÉĻn" - ], - [ - "ËĪɪ", - "k" - ], - [ - "s", - "se" - ], - [ - "Ġp", - "ËĪÉij" - ], - [ - "ĠÉĹ", - "ËĮe" - ], - [ - "ĠÉĹ", - "ËĪi" - ], - [ - "Ġa", - "z" - ], - [ - "ĠÉ¡ËĪÊĮ", - "jaËIJ" - ], - [ - "z", - "e" - ], - [ - "ĠÉĹ", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĪi" - ], - [ - "ĠËĮ", - "ÉĴn" - ], - [ - "Ġx", - "ËĪo" - ], - [ - "ĠËĮÊĬ", - "na" - ], - [ - "Ġtʰ", - "aËIJ" - ], - [ - "Ġs", - "Éij" - ], - [ - "ËĪeɪ", - "ÊĥÉĻn" - ], - [ - "ĠtÉķËĪi", - "Éľ" - ], - [ - "ĠÉŁ", - "aËIJ" - ], - [ - "p", - "ËIJ" - ], - [ - "Ġpl", - "y" - ], - [ - "θ", - "ËĪi" - ], - [ - "ËIJ", - "Éĸ" - ], - [ - "Ġt", - "ËĪuei" - ], - [ - "Ġl", - "ËĪÉĻ" - ], - [ - "Ġd", - "ÉijËIJ" - ], - [ - "f", - "t" - ], - [ - "ËĪa", - "m" - ], - [ - "ĠsËĪÊĮ", - "kt" - ], - [ - "Ġt", - "ËĪou" - ], - [ - "Ġp", - "ËĪiÉĽ" - ], - [ - "ĠËĪa", - "i" - ], - [ - "ĠwËĪÉĴ", - "n" - ], - [ - "Ġz", - "ËĮaɪn" - ], - [ - "Ġe", - "st" - ], - [ - "Ġm", - "ÉĶ" - ], - [ - "ĠtÉķ", - "jËĪÉiju" - ], - [ - 
"Éľ", - "p" - ], - [ - "ËĪÊĮ", - "z" - ], - [ - "b", - "i" - ], - [ - "ËĪÉĽËIJs", - "eËIJ" - ], - [ - "Ġl", - "ËĪy" - ], - [ - "Ġm", - "ËĮe" - ], - [ - "Ġd", - "ËĮÉĽl" - ], - [ - "ËĪiËIJ", - "l" - ], - [ - "ĠkËĮo", - "mo" - ], - [ - "Ġh", - "ËĪaÉľn" - ], - [ - "ËĪoËIJ", - "ne" - ], - [ - "ĠkËĪÊĮɾ", - "t" - ], - [ - "Ġsy", - "Êģ" - ], - [ - "ËĮÉĶ", - "ɾ" - ], - [ - "Ġɪ", - "f" - ], - [ - "u", - "v" - ], - [ - "z", - "ÉĻn" - ], - [ - "o", - "l" - ], - [ - "Ï", - "ĩ" - ], - [ - "i", - "m" - ], - [ - "Ġm", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ɪ" - ], - [ - "Ġv", - "ËĪÉĽ" - ], - [ - "ÊĬ", - "d" - ], - [ - "Ġt", - "r" - ], - [ - "ËĪeËIJ", - "s" - ], - [ - "ð", - "e" - ], - [ - "d", - "e" - ], - [ - "ʰ", - "Ïĩ" - ], - [ - "ÉŁ", - "ʰ" - ], - [ - "ËĮÉĻËIJ", - "ÉªÉľ" - ], - [ - "b", - "ËIJ" - ], - [ - "ËĪÊĬ", - "k" - ], - [ - "ĠnËĪÉĶ", - "ÉªÉľ" - ], - [ - "ĠËĮ", - "iËIJ" - ], - [ - "ËĪÉijËIJ", - "t" - ], - [ - "ËĪiËIJ", - "ɾ" - ], - [ - "Ġt", - "ɹ" - ], - [ - "ɾ", - "ÉĶ" - ], - [ - "Ġw", - "ÉĴz" - ], - [ - "Ġv", - "u" - ], - [ - "b", - "ÉĻl" - ], - [ - "b", - "ÉĻ" - ], - [ - "ɹ", - "i" - ], - [ - "nt", - "s" - ], - [ - "Ġs", - "ËĪaËIJ" - ], - [ - "d", - "ʰ" - ], - [ - "Ġt", - "ÊĬ" - ], - [ - "ĠÊİ", - "ËĮi" - ], - [ - "β", - "a" - ], - [ - "h", - "ËĪÉĻÉľÅĭ" - ], - [ - "Ġs", - "ËĪiËIJ" - ], - [ - "ĠpËĮa", - "ɾa" - ], - [ - "ËĪÉĽÉ¾", - "ÉĶ" - ], - [ - "ËĪɪ", - "s" - ], - [ - "É£", - "o" - ], - [ - "ĠËĮa", - "l" - ], - [ - "o", - "r" - ], - [ - "Ġb", - "ËĪÊĮh" - ], - [ - "Ġk", - "ËĪoËIJ" - ], - [ - "Ġt", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪo" - ], - [ - "ĠÊĴ", - "ÉĻ" - ], - [ - "p", - "Êģ" - ], - [ - "Ġ", - "ËĪaɪ" - ], - [ - "hËĪÉij", - "ÉľÅĭ" - ], - [ - "ÉĻl", - "i" - ], - [ - "ËĪeɪ", - "t" - ], - [ - "ĠjËĪiou", - "Éľ" - ], - [ - "Ġd", - "ËĪÉĻ" - ], - [ - "Ġm", - "ËĪÉĶËIJ" - ], - [ - "l", - "ËĪi" - ], - [ - "ËĮy", - "ÉĻ" - ], - [ - "ĠlËĪoËIJ", - "É¡" - ], - [ - "Ġn", - "ËĪÊĮ" - ], - [ - "Ġh", - "ËĪÊĬ" - ], - [ - "Ġn", - "ËĪÉĻÉľÅĭ" - ], - [ - "ĠÊģ", - "ÉĻ" - ], - [ - "z", - "ËĪi" - ], - [ - 
"Ġt", - "ËĪuËIJ" - ], - [ - "ĠkËĮo", - "me" - ], - [ - "Ġl", - "ËĪeËIJ" - ], - [ - "ËĪaËIJt", - "aËIJ" - ], - [ - "Ġa", - "n" - ], - [ - "ĠËĪy", - "u" - ], - [ - "ĠËĮÊĮ", - "É¡ÉĻɾ" - ], - [ - "ĠËĪɪ", - "n" - ], - [ - "ĠhËĪo", - "ÉĻ" - ], - [ - "v", - "ÉĻ" - ], - [ - "ËĪø", - "ËIJ" - ], - [ - "θj", - "a" - ], - [ - "ËĪuÉĻ", - "Éľn" - ], - [ - "Ġk", - "ÉĻɾ" - ], - [ - "ËĪa", - "t" - ], - [ - "j", - "ËĪø" - ], - [ - "ËĪÉĽt", - "Êģ" - ], - [ - "Ġp", - "ËĪÉiju" - ], - [ - "st", - "ÉĻ" - ], - [ - "Ġw", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "l" - ], - [ - "ÊĪ", - "i" - ], - [ - "Ġx", - "ËĪaiÉľ" - ], - [ - "ËĪy", - "Êģ" - ], - [ - "ĠhËĪoËIJ", - "É¡aËIJ" - ], - [ - "Ġts", - "ËĪi" - ], - [ - "ĠËĪÊĮ", - "p" - ], - [ - "Ġn", - "ËĮÉĴt" - ], - [ - "ĠlËĪɪ", - "eËIJ" - ], - [ - "Ġh", - "ËĪa" - ], - [ - "Ġf", - "l" - ], - [ - "Ġn", - "ËĪeËIJ" - ], - [ - "ËĮaËIJ", - "ɪ" - ], - [ - "Ġt", - "ËĪuo" - ], - [ - "tÊĥ", - "ËIJ" - ], - [ - "s", - "ËĪe" - ], - [ - "bʰ", - "i" - ], - [ - "ĠbËĪÊĮh", - "ÊĬt" - ], - [ - "ËĪÉĽ", - "nd" - ], - [ - "Ġs", - "ËĪÉĶ" - ], - [ - "ÉĻn", - "s" - ], - [ - "ËĮÉĻ", - "l" - ], - [ - "ÉĽ", - "Éľ" - ], - [ - "ĠÉ¡", - "l" - ], - [ - "ËĪɪ", - "ɾ" - ], - [ - "ËĪaËIJt", - "a" - ], - [ - "Éľ", - "ËIJ" - ], - [ - "ËĪÉĽnt", - "o" - ], - [ - "sk", - "ËĮoËIJ" - ], - [ - "ËĪÉĽ", - "k" - ], - [ - "ts", - "i" - ], - [ - "Ġt", - "ËĪonÉ¡" - ], - [ - "Ġb", - "iËIJ" - ], - [ - "Ġh", - "ËĪaËIJɪ" - ], - [ - "Ġb", - "ËĪi" - ], - [ - "j", - "j" - ], - [ - "Êİ", - "i" - ], - [ - "Ġk", - "ʰ" - ], - [ - "Ġs", - "ËĪo" - ], - [ - "ll", - "o" - ], - [ - "Ġb", - "aɪ" - ], - [ - "ĠÉĽ", - "nt" - ], - [ - "Ġ", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪo" - ], - [ - "ɾ", - "eËIJ" - ], - [ - "Ġk", - "Êĭ" - ], - [ - "Ġm", - "ËĪeiÉľ" - ], - [ - "ÊĬ", - "ËĪÉĶËIJ" - ], - [ - "Ġt", - "ËĪaɪ" - ], - [ - "Ġsu", - "s" - ], - [ - "Ġr", - "i" - ], - [ - "Ġv", - "ËĮÉĽ" - ], - [ - "ËĪiËIJ", - "no" - ], - [ - "v", - "ano" - ], - [ - "ĠdËĮi", - "ËIJ" - ], - [ - "ĠÊIJ", - "ËĪaÉľn" - ], - [ - "Ê", - "Ĥ" - ], - [ - "ĠÉIJ", - 
"b" - ], - [ - "ËĪaËIJ", - "h" - ], - [ - "ɪ", - "Êĥ" - ], - [ - "ĠdËĮe", - "lla" - ], - [ - "tËIJ", - "i" - ], - [ - "ĠËĪÊĬ", - "n" - ], - [ - "Ġh", - "iËIJ" - ], - [ - "Ġb", - "ËĪaËIJt" - ], - [ - "Ġth", - "ËĪi" - ], - [ - "Ġa", - "m" - ], - [ - "Ġ", - "ËĪoËIJ" - ], - [ - "Ġh", - "u" - ], - [ - "Ġk", - "ËĪÊĮh" - ], - [ - "Ġz", - "ËĪÉijËIJ" - ], - [ - "ĠÉ¡", - "ËĮÉĶ" - ], - [ - "Ġ", - "ËĪÉĻÊĬ" - ], - [ - "y", - "ËĪi" - ], - [ - "Ġl", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪeËIJ" - ], - [ - "Ġs", - "ËĪÉĶËIJ" - ], - [ - "sk", - "ËĮeËIJ" - ], - [ - "ɾ", - "o" - ], - [ - "Êģ", - "ËĪÉij" - ], - [ - "t", - "ËĪa" - ], - [ - "Ġk", - "ËĪÊĬ" - ], - [ - "ËĪant", - "e" - ], - [ - "Ġd", - "ÉĶ" - ], - [ - "Ġs", - "ËĪeɪ" - ], - [ - "Ġs", - "ÉĽt" - ], - [ - "ɹ", - "ɪ" - ], - [ - "ĠÉ¡ËĮÉĻÊĬ", - "ɪÅĭ" - ], - [ - "z", - "o" - ], - [ - "Ġj", - "ËĪaËIJ" - ], - [ - "ĠÉĴv", - "ðÉĻ" - ], - [ - "ĠÊ", - "Ŀ" - ], - [ - "ĠÉĽ", - "l" - ], - [ - "Ġs", - "ËĪoËIJ" - ], - [ - "Ġth", - "ËĪiÉľ" - ], - [ - "Ġ", - "ËĪÉĽl" - ], - [ - "Ġly", - "ËĮi" - ], - [ - "nd", - "ÊĴ" - ], - [ - "ĠÉķ", - "jËĪÉiju" - ], - [ - "θ", - "a" - ], - [ - "ĠɾËĮÉĻh", - "eËIJ" - ], - [ - "Ġma", - "ɪ" - ], - [ - "j", - "ÉĻ" - ], - [ - "ĠËĪÊĮ", - "b" - ], - [ - "as", - "jËĪÉĶ" - ], - [ - "d", - "Êģ" - ], - [ - "Ġkh", - "ËĪa" - ], - [ - "ĠËĪe", - "s" - ], - [ - "v", - "i" - ], - [ - "f", - "i" - ], - [ - "ËĮÉĻ", - "b" - ], - [ - "Ġr", - "e" - ], - [ - "Ġav", - "ËĮÉĽ" - ], - [ - "Ġt", - "ËĮi" - ], - [ - "Ġk", - "ɾ" - ], - [ - "Ġb", - "ɪk" - ], - [ - "st", - "e" - ], - [ - "ËĪeËIJÊĥ", - "c" - ], - [ - "p", - "t" - ], - [ - "z", - "ÉĻ" - ], - [ - "Ġw", - "ËĪaËIJ" - ], - [ - "k", - "l" - ], - [ - "ĠsËĪÊĮ", - "m" - ], - [ - "ɪ", - "ÊĪ" - ], - [ - "d", - "z" - ], - [ - "v", - "o" - ], - [ - "ËĮa", - "ÊĬt" - ], - [ - "nd", - "e" - ], - [ - "Ġd", - "ÉĽs" - ], - [ - "ĠÉŁ", - "ËĪaËIJ" - ], - [ - "Ġr", - "ËĮi" - ], - [ - "s", - "ËĮeËIJ" - ], - [ - "É¡", - "i" - ], - [ - "Ġal", - "s" - ], - [ - "ËĪi", - "ðo" - ], - [ - "ĠnËĪi", - "Éľn" - ], - [ - 
"ÊĬ", - "l" - ], - [ - "ts", - "ËIJ" - ], - [ - "ËĪant", - "o" - ], - [ - "ĠÉĹ", - "ËĪÉĻÊĬ" - ], - [ - "kËIJ", - "i" - ], - [ - "ĠsËĪÊĮ", - "b" - ], - [ - "Ġn", - "ËĪa" - ], - [ - "Ġl", - "ËĮo" - ], - [ - "Ġph", - "ËĪi" - ], - [ - "m", - "ËĮe" - ], - [ - "Ġf", - "a" - ], - [ - "k", - "ÉĻ" - ], - [ - "Ġz", - "ËĪu" - ], - [ - "n", - "s" - ], - [ - "ĠÊģ", - "e" - ], - [ - "Ġb", - "ËĪo" - ], - [ - "ËĪaËIJt", - "i" - ], - [ - "Ġm", - "an" - ], - [ - "ĠlËĪi", - "Éij" - ], - [ - "ĠÉĹ", - "ËĮyÉĻ" - ], - [ - "Ġf", - "ËĪÉĶËIJ" - ], - [ - "ĠkÊĭ", - "ËĪeËIJÊĥc" - ], - [ - "Ġx", - "ËĪÉij" - ], - [ - "ĠtÉķ", - "ËĪu" - ], - [ - "j", - "ÉĻɾ" - ], - [ - "Ġɪ", - "st" - ], - [ - "w", - "ËĪi" - ], - [ - "ĠËĮaɪn", - "ÉĻ" - ], - [ - "ɪ", - "É¡" - ], - [ - "Ġs", - "ÊĪ" - ], - [ - "ËĪi", - "ÉĻl" - ], - [ - "Ġn", - "ËĪiÉĽÉľn" - ], - [ - "ĠËĮÉĽ", - "ËIJ" - ], - [ - "ËĪaɪ", - "nd" - ], - [ - "Ġz", - "ËĪi" - ], - [ - "v", - "ÉĻn" - ], - [ - "m", - "z" - ], - [ - "ð", - "os" - ], - [ - "dÊĴ", - "ËIJ" - ], - [ - "j", - "ËĪa" - ], - [ - "ɾ", - "ËĪÉĶ" - ], - [ - "l", - "ËĪe" - ], - [ - "Ê", - "²" - ], - [ - "Ġv", - "ËĪÉĶ" - ], - [ - "Ġl", - "ËĪiÉĽ" - ], - [ - "θ", - "e" - ], - [ - "mËĪe", - "nte" - ], - [ - "Ġɪn", - "ðÉĻ" - ], - [ - "Ġaɪ", - "m" - ], - [ - "n", - "ÉĻn" - ], - [ - "Ġh", - "ÉĻm" - ], - [ - "ɾ", - "aËIJ" - ], - [ - "ĠsËĪuo", - "Éľ" - ], - [ - "Ġɲ", - "ËĪi" - ], - [ - "Ġɹ", - "ËĪiÉĻl" - ], - [ - "l", - "ËĪa" - ], - [ - "Ġb", - "ËĪÉĶ" - ], - [ - "Ġk", - "ËĪai" - ], - [ - "Êģ", - "ËĪa" - ], - [ - "Ġw", - "ËĪÉľËIJ" - ], - [ - "Ġa", - "ËIJ" - ], - [ - "Ġp", - "as" - ], - [ - "ËĪÊĮ", - "s" - ], - [ - "w", - "ËĪÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪe" - ], - [ - "ĠhËĮa", - "tÉĻ" - ], - [ - "a", - "ɪn" - ], - [ - "ĠËĪÉĶ", - "pʰ" - ], - [ - "Êģ", - "ËĪe" - ], - [ - "ĠÉŁaËIJ", - "ËĪeËIJÉ¡aËIJ" - ], - [ - "ĠËĪÊĬ", - "s" - ], - [ - "ĠtÉķhËĪi", - "Éľ" - ], - [ - "nt", - "Êĥ" - ], - [ - "Ġx", - "ËĪuo" - ], - [ - "ËĪu", - "Êģ" - ], - [ - "Ġɪ", - "m" - ], - [ - "ɳ", - "Éĸ" - ], - [ - "ËĪyÉĻ", - "Éľkh" - ], 
- [ - "ĠËĪy", - "ÉĽ" - ], - [ - "Ġm", - "ËĮaËIJ" - ], - [ - "Åĵ", - "Êģ" - ], - [ - "ĠËĪa", - "lt" - ], - [ - "Ġk", - "ÉĻm" - ], - [ - "Êİ", - "o" - ], - [ - "ĠÉIJ", - "n" - ], - [ - "Ġf", - "y" - ], - [ - "ĠËĮÉĽ", - "ra" - ], - [ - "ĠÉ¡", - "ËĪÊĬ" - ], - [ - "Ġp", - "ËĪÊĮ" - ], - [ - "l", - "s" - ], - [ - "Ġl", - "ËĪiËIJ" - ], - [ - "ĠÊĤ", - "ËĪy" - ], - [ - "Ġbɪk", - "ËĪÊĮz" - ], - [ - "ĠÉ¡", - "ÉĽt" - ], - [ - "Ġb", - "ɾ" - ], - [ - "t", - "ʰ" - ], - [ - "tÉĻl", - "ËĮÉĻb" - ], - [ - "x", - "o" - ], - [ - "sk", - "ËĮaËIJ" - ], - [ - "ɲ", - "ʲ" - ], - [ - "ËĪeËIJk", - "ÊĪ" - ], - [ - "r", - "ÉĻ" - ], - [ - "tÊĥ", - "o" - ], - [ - "ĠpÊģ", - "ÉĶ" - ], - [ - "Ġɹ", - "ËĪaɪt" - ], - [ - "Ġp", - "ËĪei" - ], - [ - "ËĮ", - "ɪç" - ], - [ - "j", - "ËĪÉĽÉ¾" - ], - [ - "tËIJ", - "a" - ], - [ - "ĠÉIJb", - "ËĮaÊĬt" - ], - [ - "ĠkÊĭËĪeËIJÊĥc", - "ÉĻn" - ], - [ - "Ġv", - "ËĪe" - ], - [ - "ÊĬ", - "Éľ" - ], - [ - "Ġa", - "kËĪe" - ], - [ - "Ġp", - "ËĪai" - ], - [ - "v", - "ËĪÉĽ" - ], - [ - "Ġθ", - "ɹ" - ], - [ - "ɪ", - "f" - ], - [ - "Ġav", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪe" - ], - [ - "d", - "ËĪi" - ], - [ - "ËĪeËIJ", - "Éĸ" - ], - [ - "Ġb", - "ÉĻt" - ], - [ - "ÊĪ", - "ʰ" - ], - [ - "t", - "eËIJ" - ], - [ - "θj", - "ËĪÉĶn" - ], - [ - "d", - "Éľ" - ], - [ - "ĠjËĪi", - "Éľ" - ], - [ - "Ġv", - "e" - ], - [ - "É£", - "ËĪu" - ], - [ - "ËĪÊĮh", - "ÉĻl" - ], - [ - "Ġp", - "ÉĶ" - ], - [ - "ĠÉ¡", - "r" - ], - [ - "Ġð", - "a" - ], - [ - "Ġv", - "ËĪiËIJ" - ], - [ - "ĠËĮ", - "ÉijËIJ" - ], - [ - "ËĪÉĻÊĬ", - "nt" - ], - [ - "Ġb", - "ËĪaËIJɾ" - ], - [ - "ĠmËĪÊĮ", - "tÉĻlËĮÉĻb" - ], - [ - "l", - "d" - ], - [ - "ĠtÉķ", - "ËĮÉĶ" - ], - [ - "p", - "a" - ], - [ - "ð", - "ËĪad" - ], - [ - "ËĪi", - "ɾ" - ], - [ - "Ġx", - "ËĪu" - ], - [ - "ĠlËĪi", - "ÉľÅĭ" - ], - [ - "ËĪeɪ", - "s" - ], - [ - "ĠÉĹËĮe", - "Éľn" - ], - [ - "Ġth", - "ËĪiÉĽ" - ], - [ - "tËIJ", - "e" - ], - [ - "ĠavËĮÉĽ", - "k" - ], - [ - "ĠËĮ", - "ÉĶ" - ], - [ - "Ġk", - "ËĪÉiju" - ], - [ - "ɪ", - "v" - ], - [ - "iËIJ", - "z" - ], - [ - "ËĪo", 
- "s" - ], - [ - "ĠÉ¡", - "ɹ" - ], - [ - "a", - "nd" - ], - [ - "ĠlËĪi", - "ou" - ], - [ - "ĠËĪo", - "Éľ" - ], - [ - "É¡", - "l" - ], - [ - "Ġp", - "ËĪÉĶËIJ" - ], - [ - "Ġm", - "ËĮeËIJ" - ], - [ - "Ġk", - "ËĪÉĴ" - ], - [ - "n", - "os" - ], - [ - "ç", - "ÉĻn" - ], - [ - "f", - "ÉĻn" - ], - [ - "ĠsËĪÊĮkt", - "ËĮeËIJ" - ], - [ - "Ġ", - "ËĪaɪn" - ], - [ - "ËĪoËIJ", - "re" - ], - [ - "j", - "ËĪÉĽn" - ], - [ - "Ġð", - "ËĪÉĽn" - ], - [ - "ĠtÉķh", - "ËĪiÉĽÉľn" - ], - [ - "Ġh", - "ËĪaɪ" - ], - [ - "ɾ", - "ËĪÉĽ" - ], - [ - "Ġs", - "ËĪu" - ], - [ - "ĠkËĪɪ", - "jaËIJ" - ], - [ - "Ġpj", - "ËĮÊĬ" - ], - [ - "ĠhÉĻm", - "ËĮaËIJ" - ], - [ - "ĠËĮÊĮ", - "p" - ], - [ - "Ġp", - "ËĪÊĮhÉĻl" - ], - [ - "Ġx", - "ËĪÉĻ" - ], - [ - "d", - "ËĪe" - ], - [ - "Ġm", - "Éij" - ], - [ - "ĠÊĬ", - "m" - ], - [ - "nd", - "ÉĻ" - ], - [ - "Ġd", - "ËĪÉĻÊĬnt" - ], - [ - "ËĪeËIJ", - "ÊĥÉĻn" - ], - [ - "Ġða", - "ts" - ], - [ - "i", - "s" - ], - [ - "Ġc", - "ËĪaËIJh" - ], - [ - "p", - "e" - ], - [ - "Ġs", - "ËĮo" - ], - [ - "Ġð", - "ËĪe" - ], - [ - "Ġs", - "ËĪaËIJt" - ], - [ - "ËĪa", - "Êģ" - ], - [ - "Ġs", - "ËĪe" - ], - [ - "ÉĻ", - "k" - ], - [ - "ɪ", - "Êĭ" - ], - [ - "ĠkËĪoËIJ", - "i" - ], - [ - "k", - "ÉĶ" - ], - [ - "Ġv", - "ËĪaËIJÊĬ" - ], - [ - "Ġf", - "ËĪei" - ], - [ - "Ġl", - "ËĪeËIJk" - ], - [ - "Ġh", - "ËĪiÉĻ" - ], - [ - "Ġa", - "ÊĬ" - ], - [ - "ËĪÉĽ", - "ndo" - ], - [ - "ËĪe", - "s" - ], - [ - "Ġz", - "ËĪÉĶ" - ], - [ - "Ġ", - "ËĪÉĽÉ¾a" - ], - [ - "nËĪi", - "Éľn" - ], - [ - "ĠkËĪÊĮ", - "m" - ], - [ - "Ġl", - "ËĪÉĴ" - ], - [ - "ɪ", - "st" - ], - [ - "Ġp", - "Éij" - ], - [ - "Ġf", - "ËĪÉĶ" - ], - [ - "Ġth", - "ËĪonÉ¡" - ], - [ - "nk", - "e" - ], - [ - "ËĮ", - "ɪk" - ], - [ - "Ġɲ", - "ËĪÉĻ" - ], - [ - "ËĮÊĮ", - "m" - ], - [ - "ËĪiËIJ", - "t" - ], - [ - "ĠwËĪÉĴ", - "nt" - ], - [ - "ËĪaβ", - "an" - ], - [ - "ĠbËĪÊĮ", - "r" - ], - [ - "ÉĽ", - "nd" - ], - [ - "ĠËĮÉijËIJ", - "bÉľ" - ], - [ - "Ġv", - "ËĪaɪ" - ], - [ - "ĠtÊĥ", - "ËĮi" - ], - [ - "ĠθËĪɪÅĭ", - "k" - ], - [ - "st", - "i" - ], - [ - "Ġk", - "ɹ" 
- ], - [ - "ĠËĪa", - "ÊĬt" - ], - [ - "st", - "ÉĻn" - ], - [ - "ĠÊĭ", - "ËĪÊĮn" - ], - [ - "ĠÉ¡", - "ËĮaËIJ" - ], - [ - "ËĪaËIJÉľ", - "ɲ" - ], - [ - "Êģ", - "i" - ], - [ - "ĠnËĪÉĶ", - "x" - ], - [ - "ĠɹËĪiÉĻl", - "ɪ" - ], - [ - "Ġv", - "ËĮi" - ], - [ - "Ġðe", - "ÉĻ" - ], - [ - "ËĮɪ", - "tÊĥ" - ], - [ - "Ġv", - "ËĪyÉĻ" - ], - [ - "ĠËĮaËIJpk", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĮaËIJɪ" - ], - [ - "Ġp", - "ËĪÉĶ" - ], - [ - "ĠnËĪÊĮ", - "mb" - ], - [ - "θ", - "es" - ], - [ - "j", - "ËĪÉĽÊģ" - ], - [ - "ĠkËĪÊĬ", - "cʰ" - ], - [ - "m", - "ËĪÉĽ" - ], - [ - "Ġv", - "ËĪu" - ], - [ - "Ġl", - "ÅĵÊģ" - ], - [ - "ĠiËIJ", - "m" - ], - [ - "ÊĪ", - "ÉĻɾ" - ], - [ - "tÊĥ", - "i" - ], - [ - "ËIJ", - "s" - ], - [ - "Ġt", - "ËĪy" - ], - [ - "ĠmËĪi", - "ÉľÅĭ" - ], - [ - "ɾ", - "ËĪe" - ], - [ - "m", - "ËĮa" - ], - [ - "Ġm", - "ËĮiËIJ" - ], - [ - "ĠÉĽ", - "ks" - ], - [ - "ɪ", - "p" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮaËIJ" - ], - [ - "ĠËĮaÊĬ", - "x" - ], - [ - "r", - "ËĪiËIJ" - ], - [ - "Ġc", - "ËĪÊĮl" - ], - [ - "m", - "os" - ], - [ - "ĠkËĪÊĮɾt", - "ËĮeËIJ" - ], - [ - "iËIJ", - "ɾ" - ], - [ - "k", - "ÉĻn" - ], - [ - "Ġd", - "ËĪu" - ], - [ - "n", - "aËIJ" - ], - [ - "Ġp", - "wËĪe" - ], - [ - "ËĮÉĶ", - "ɪ" - ], - [ - "ĠtÉķh", - "ËĪiÉĽ" - ], - [ - "Ġβ", - "ËĪi" - ], - [ - "ËĪiÉĽ", - "Éľt" - ], - [ - "Ġt", - "e" - ], - [ - "ËĪað", - "os" - ], - [ - "m", - "ËĪa" - ], - [ - "Ġv", - "ËĪo" - ], - [ - "Ġm", - "ËĪɪ" - ], - [ - "Ġb", - "ËĮi" - ], - [ - "a", - "d" - ], - [ - "d", - "o" - ], - [ - "Ġn", - "ËĪaÊĬ" - ], - [ - "ĠʲËĪy", - "Éľ" - ], - [ - "w", - "ËĪÉĽ" - ], - [ - "ËĪi", - "s" - ], - [ - "e", - "l" - ], - [ - "Ġpa", - "r" - ], - [ - "Ġt", - "ËĪai" - ], - [ - "ĠdËĪɪ", - "jaËIJ" - ], - [ - "h", - "ËĪi" - ], - [ - "Ġɾ", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪe" - ], - [ - "ËĪaɪ", - "d" - ], - [ - "Ġp", - "er" - ], - [ - "Ġs", - "ËĮÉĶ" - ], - [ - "w", - "e" - ], - [ - "ÊĬ", - "m" - ], - [ - "Ġi", - "n" - ], - [ - "ĠjËĪuËIJ", - "z" - ], - [ - "ËĪiËIJp", - "ÉĻl" - ], - [ - "ĠÊĭ", - "ËĪaËIJl" - ], - [ - "Ġe", - 
"tËĪÉĽ" - ], - [ - "ËĮÉĽ", - "m" - ], - [ - "Ġn", - "ËĪu" - ], - [ - "ËĪÉĽ", - "kt" - ], - [ - "ĠiËIJ", - "ɾ" - ], - [ - "Ġb", - "ɹ" - ], - [ - "Ġtsh", - "ËĪi" - ], - [ - "ĠÉĹ", - "ËĪÉĶÉľ" - ], - [ - "Ġkw", - "ËĮa" - ], - [ - "Ġf", - "ËĪuÉľ" - ], - [ - "w", - "ËĮa" - ], - [ - "Ġd", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪyÉĻ" - ], - [ - "ËĮÉĽ", - "ËIJ" - ], - [ - "r", - "ËĪa" - ], - [ - "Ġn", - "e" - ], - [ - "Ġz", - "ËĪyÉĻ" - ], - [ - "Ġb", - "ËĪaɪ" - ], - [ - "ĠÉŁ", - "ËĪÊĮb" - ], - [ - "ËĪuËIJ", - "to" - ], - [ - "ÊĬ", - "nt" - ], - [ - "Ġc", - "ʰ" - ], - [ - "ËĪÉĽnt", - "i" - ], - [ - "ËĪo", - "ÉĻ" - ], - [ - "Ġs", - "ËĮÊĮm" - ], - [ - "Ġl", - "Éij" - ], - [ - "ËĮe", - "va" - ], - [ - "ɾ", - "ÉĽ" - ], - [ - "nt", - "Éľ" - ], - [ - "Ġm", - "ËĪÉĽn" - ], - [ - "ËĪÉijËIJ", - "k" - ], - [ - "Ġki", - "l" - ], - [ - "ËĪon", - "es" - ], - [ - "f", - "f" - ], - [ - "Ġm", - "ËĪÉĽËIJ" - ], - [ - "Ġv", - "ËĪÉĻɪ" - ], - [ - "Ġ", - "ËĪÉĶËIJ" - ], - [ - "ĠËĮɪ", - "nt" - ], - [ - "ÊĬ", - "n" - ], - [ - "Ġw", - "ɪl" - ], - [ - "Ġs", - "in" - ], - [ - "ĠËĮa", - "lla" - ], - [ - "Ġaβ", - "ËĪia" - ], - [ - "p", - "i" - ], - [ - "ËĪo", - "Éľ" - ], - [ - "ɪj", - "ËĮaËIJ" - ], - [ - "k", - "u" - ], - [ - "Ġv", - "ËĪɪ" - ], - [ - "Ġtu", - "t" - ], - [ - "ĠtËĪe", - "Éľ" - ], - [ - "Ġh", - "ËĪÉĶ" - ], - [ - "β", - "ɾe" - ], - [ - "s", - "ÉĻɾ" - ], - [ - "Ġkh", - "ËĪai" - ], - [ - "Ġm", - "ËĪÉĶ" - ], - [ - "Ġt", - "a" - ], - [ - "Ġɲ", - "ËĪaËIJ" - ], - [ - "Ġn", - "u" - ], - [ - "ËĪuËIJ", - "n" - ], - [ - "ĠÉĻËIJ", - "Éľ" - ], - [ - "ĠËĪa", - "ÊĬf" - ], - [ - "ËĪiËIJd", - "Éľ" - ], - [ - "nt", - "i" - ], - [ - "Ġp", - "ËĪiËIJpÉĻl" - ], - [ - "Ġk", - "j" - ], - [ - "Ġp", - "e" - ], - [ - "Ġm", - "ËĪÉij" - ], - [ - "ËĮa", - "ɪ" - ], - [ - "ËĪaËIJ", - "le" - ], - [ - "Ġv", - "ËĮÉĻËIJÉªÉľ" - ], - [ - "mp", - "o" - ], - [ - "ĠkËĪɪ", - "t" - ], - [ - "Ġn", - "ËĮÉĽ" - ], - [ - "ĠÉŁ", - "ËĪaËIJtaËIJ" - ], - [ - "ĠsËĪaËIJt", - "ʰ" - ], - [ - "ĠÉŁ", - "ËĪi" - ], - [ - "Ġs", - "o" - ], - [ - "Ġb", - 
"ËĪÉĽ" - ], - [ - "k", - "ËĪi" - ], - [ - "ɪt", - "i" - ], - [ - "Ġts", - "i" - ], - [ - "Ġk", - "Êģ" - ], - [ - "ËĮ", - "ÉĴ" - ], - [ - "É¡", - "ÉĻl" - ], - [ - "k", - "st" - ], - [ - "Ġm", - "ËĪÉĻËIJ" - ], - [ - "ËĪÊĮ", - "k" - ], - [ - "Ġn", - "ËĪaËIJÊĬ" - ], - [ - "Ġa", - "p" - ], - [ - "ĠlËĪɪ", - "kʰ" - ], - [ - "ll", - "i" - ], - [ - "ĠkwËĪa", - "l" - ], - [ - "Ġ", - "ËĪÉĻËIJ" - ], - [ - "Ġts", - "ËĪuei" - ], - [ - "Ġd", - "o" - ], - [ - "ĠkËIJ", - "jËĪo" - ], - [ - "ÊĬ", - "z" - ], - [ - "Ġp", - "ËĪaËIJ" - ], - [ - "Ġm", - "ËĪuËIJ" - ], - [ - "ĠÉ¡ÉĻ", - "v" - ], - [ - "r", - "ËĪi" - ], - [ - "Ġt", - "w" - ], - [ - "ËĮ", - "ɪn" - ], - [ - "d", - "ËĪÉij" - ], - [ - "Ġð", - "ËĪi" - ], - [ - "ĠËĪaËIJ", - "i" - ], - [ - "Ġh", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ËĮÉĽm" - ], - [ - "Ġpʰ", - "ËĪɪɾ" - ], - [ - "ÉĴ", - "m" - ], - [ - "ĠËĮ", - "eËIJ" - ], - [ - "Ġth", - "ËĪaiÉľ" - ], - [ - "Ġv", - "ËĪas" - ], - [ - "Ġn", - "ÉijËIJ" - ], - [ - "p", - "ÉĻn" - ], - [ - "Ġp", - "ËĮÉĻɾ" - ], - [ - "ĠÉĹ", - "ËĪaËIJɪ" - ], - [ - "ËĪou", - "Éľ" - ], - [ - "ĠÊIJ", - "ËĪuÉľ" - ], - [ - "ĠmËĪa", - "n" - ], - [ - "ĠtËĪÉĻ", - "ÉªÉľ" - ], - [ - "Ġl", - "ËĪaËIJÊĬ" - ], - [ - "m", - "ËĪÉĽnte" - ], - [ - "ĠfËĪa", - "m" - ], - [ - "s", - "jËĪÉĶ" - ], - [ - "Ġp", - "ËĪÉĻ" - ], - [ - "ËĪeËIJ", - "m" - ], - [ - "Ġp", - "ËĪÊĮr" - ], - [ - "j", - "ËĪi" - ], - [ - "Ġl", - "ÉĽ" - ], - [ - "Ġt", - "en" - ], - [ - "ËĪoËIJ", - "ra" - ], - [ - "k", - "i" - ], - [ - "ĠÊĤ", - "ËĪaËIJÊĬ" - ], - [ - "k", - "ɪ" - ], - [ - "bËIJ", - "e" - ], - [ - "ËĪa", - "lt" - ], - [ - "ð", - "ɪ" - ], - [ - "p", - "ËĪi" - ], - [ - "ĠËĮÉĽ", - "nt" - ], - [ - "Ġm", - "ËĪei" - ], - [ - "Ġh", - "ËĪÉĻÊĬ" - ], - [ - "Ġh", - "ËĪÉĽÉ¾" - ], - [ - "j", - "ËĪÉij" - ], - [ - "ĠhËĪÊĬ", - "aËIJ" - ], - [ - "m", - "Éľ" - ], - [ - "Ġd", - "ʰ" - ], - [ - "ĠtÊĥ", - "ËĪe" - ], - [ - "l", - "ËĪÉĽ" - ], - [ - "ËĪaËIJt", - "e" - ], - [ - "Ġp", - "ËĪuËIJ" - ], - [ - "Ġm", - "ËĪÊĬ" - ], - [ - "ËĪaËIJɪ", - "ÊĪ" - ], - [ - "d", - "iËIJ" - ], - [ - 
"Ġfɹ", - "ÉĴm" - ], - [ - "Ġh", - "ËĪÉijËIJ" - ], - [ - "β", - "o" - ], - [ - "ĠmËĪi", - "Éľn" - ], - [ - "Ġð", - "iËIJz" - ], - [ - "Ġk", - "ËĪou" - ], - [ - "ËĪiËIJ", - "na" - ], - [ - "Ġav", - "ËĮeva" - ], - [ - "Ġ", - "ËĪaËIJɾ" - ], - [ - "Ġn", - "ËĪuËIJɾ" - ], - [ - "Ġβ", - "ËĪe" - ], - [ - "Ġz", - "aɪn" - ], - [ - "ËĪÉĽ", - "d" - ], - [ - "É", - "Ĺ" - ], - [ - "ËĪeɪ", - "k" - ], - [ - "s", - "ËĮÉĻÊĬ" - ], - [ - "ËĪeËIJ", - "ÉŁ" - ], - [ - "ĠÊĤ", - "ËĪÉĻËIJ" - ], - [ - "j", - "e" - ], - [ - "cʰ", - "ËIJ" - ], - [ - "ËĪÉĶ", - "r" - ], - [ - "ÉĽ", - "ËIJ" - ], - [ - "ĠtÉķhËĪy", - "Ã¦Éľn" - ], - [ - "ĠËĮaɪn", - "ÉĻn" - ], - [ - "ĠiËIJ", - "n" - ], - [ - "ĠbËĪÊĮ", - "c" - ], - [ - "ËĪiËIJ", - "m" - ], - [ - "ɾ", - "as" - ], - [ - "ËĮÉĻ", - "s" - ], - [ - "Ġv", - "ËĪeËIJ" - ], - [ - "ĠËĪÉĻr", - "Éľ" - ], - [ - "Ġd", - "uËIJ" - ], - [ - "nt", - "ÉĻ" - ], - [ - "Ġpɹ", - "ËĪÉĴ" - ], - [ - "Ġb", - "ËĪɪ" - ], - [ - "ĠwËĪo", - "Éľ" - ], - [ - "n", - "ËĮi" - ], - [ - "Ġh", - "ÉIJ" - ], - [ - "Ġk", - "ËĪÉĽ" - ], - [ - "Ġe", - "t" - ], - [ - "jËĪÉĽ", - "ndo" - ], - [ - "ĠËĪai", - "Éľ" - ], - [ - "Ġl", - "i" - ], - [ - "ĠËĪaÊĬ", - "s" - ], - [ - "kËIJ", - "o" - ], - [ - "ĠÉĹ", - "ËĪyÉĻ" - ], - [ - "k", - "eËIJ" - ], - [ - "Ġf", - "ËĪiËIJl" - ], - [ - "Ġbʰ", - "ËĪaËIJi" - ], - [ - "ĠÉ¡ÉĻ", - "Êĥ" - ], - [ - "ÊĴ", - "ËĪe" - ], - [ - "Ġn", - "jËĪuËIJ" - ], - [ - "ĠËĪa", - "k" - ], - [ - "ĠÉĹ", - "ËĪaËIJ" - ], - [ - "z", - "ËĪa" - ], - [ - "v", - "ËĪe" - ], - [ - "ĠhËĮa", - "ÊĬ" - ], - [ - "ÉIJ", - "ç" - ], - [ - "ĠɾËĪÊĮ", - "kʰ" - ], - [ - "p", - "ËĪe" - ], - [ - "ĠtÉĻ", - "bi" - ], - [ - "ĠpËĪÊĮhÉĻl", - "ËĮeËIJ" - ], - [ - "Ġf", - "ËĪÉĽ" - ], - [ - "Ġw", - "ËĮɪtÊĥ" - ], - [ - "ĠtÉķËĪy", - "ÉĽÉľ" - ], - [ - "w", - "ËĮe" - ], - [ - "ËĮa", - "ɪt" - ], - [ - "ĠnÉijËIJ", - "x" - ], - [ - "ĠkËĪÉĶËIJ", - "n" - ], - [ - "ÊĬ", - "k" - ], - [ - "ĠbËĪaËIJ", - "d" - ], - [ - "Åĭ", - "ÉĻn" - ], - [ - "Ġn", - "i" - ], - [ - "Ġb", - "ËĪe" - ], - [ - "Ġm", - "ËĮÊĬ" - ], - [ - "ËĪa", - "r" - 
], - [ - "ĠmËĮe", - "ɪk" - ], - [ - "Ġs", - "ËĪaËIJɾ" - ], - [ - "β", - "e" - ], - [ - "ĠtÉķhËĪi", - "ÉľÅĭ" - ], - [ - "it", - "ËĪe" - ], - [ - "k", - "ËĮe" - ], - [ - "ËĪÉĽËIJ", - "l" - ], - [ - "ËĮ", - "ÉĴn" - ], - [ - "ËĮ", - "Éij" - ], - [ - "Ġb", - "ËĪɪl" - ], - [ - "Ġw", - "ÊĬd" - ], - [ - "Ġb", - "ËĪoËIJl" - ], - [ - "r", - "d" - ], - [ - "i", - "ÉĻ" - ], - [ - "Ġd", - "a" - ], - [ - "Ġb", - "ËĪaËIJÊĬ" - ], - [ - "ĠnËĪÊĮmb", - "ÉĻɾ" - ], - [ - "ËĪaËIJɪ", - "Éľ" - ], - [ - "ĠÉĽ", - "m" - ], - [ - "Ġm", - "iËIJɾ" - ], - [ - "ËĪeɪ", - "m" - ], - [ - "l", - "os" - ], - [ - "ËĮÉĽ", - "t" - ], - [ - "ĠËĮaÊĬ", - "s" - ], - [ - "ĠmËĪa", - "Éľt" - ], - [ - "Ġw", - "ËĪuÉĻ" - ], - [ - "Ġw", - "ËĪeɪ" - ], - [ - "Ġse", - "ɲ" - ], - [ - "Ġb", - "jËĪÉĽ" - ], - [ - "Ġw", - "ÉĽn" - ], - [ - "f", - "l" - ], - [ - "Ġkh", - "wËĪa" - ], - [ - "d", - "ËĪÉĽ" - ], - [ - "v", - "ɹɪ" - ], - [ - "ĠËĪa", - "ɾ" - ], - [ - "jËĪÉiju", - "Éľ" - ], - [ - "ĠËĮaËIJpk", - "ËĮeËIJ" - ], - [ - "b", - "Êģ" - ], - [ - "ĠtËĪaɪ", - "m" - ], - [ - "Ġ", - "ËĪÉij" - ], - [ - "Ġs", - "ËĮa" - ], - [ - "Ġz", - "ËĪoɪ" - ], - [ - "ËĪÉĶɾ", - "a" - ], - [ - "Ġd", - "ËĪø" - ], - [ - "ËĪÉĶɾ", - "t" - ], - [ - "ĠÅĭ", - "ËĪÉĶ" - ], - [ - "m", - "in" - ], - [ - "Ġl", - "ËĪÊĬk" - ], - [ - "ËĪÉĶËIJ", - "t" - ], - [ - "ĠËĪÉĶ", - "tɾ" - ], - [ - "Ġf", - "ËĪaɪ" - ], - [ - "ĠÉ¡", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "ÉĻn" - ], - [ - "k", - "ËĪÉĶ" - ], - [ - "ĠvËĪÉĽ", - "ɹi" - ], - [ - "m", - "ÉĽ" - ], - [ - "ËĪaɪ", - "z" - ], - [ - "Ġe", - "sp" - ], - [ - "ɲ", - "a" - ], - [ - "Ġl", - "ËĪo" - ], - [ - "ËĪÉĽËIJ", - "ra" - ], - [ - "β", - "ËĪi" - ], - [ - "ou", - "Éľ" - ], - [ - "ËĮÉĻ", - "k" - ], - [ - "tÊĥ", - "uËIJ" - ], - [ - "Ġn", - "ËĪyÉĻ" - ], - [ - "ÊĪ", - "ɾ" - ], - [ - "ĠÉ¡", - "ËĪy" - ], - [ - "ĠtËĪo", - "ðo" - ], - [ - "ËĪɪ", - "çt" - ], - [ - "Ġm", - "ɪç" - ], - [ - "ĠËĪa", - "nd" - ], - [ - "Ġkw", - "ËĮÉĽl" - ], - [ - "ĠÊĤ", - "ËĪaËIJ" - ], - [ - "ĠnËĪi", - "Éľ" - ], - [ - "ËĪÉĶ", - "p" - ], - [ - "ËĪiËIJ", - "z" 
- ], - [ - "ĠÊĤ", - "ËĪaÊĬ" - ], - [ - "ĠɾËĮÉĻh", - "i" - ], - [ - "ĠsËĮÊĬ", - "o" - ], - [ - "ĠÉĽ", - "É¡" - ], - [ - "Ġd", - "Åĵ" - ], - [ - "ĠÉ¡ËĮaËIJ", - "ÉªÉľ" - ], - [ - "d", - "ɪ" - ], - [ - "l", - "ËĮa" - ], - [ - "st", - "ËĪi" - ], - [ - "ĠdËĮiËIJ", - "z" - ], - [ - "Ġt", - "ËĮÊĬ" - ], - [ - "θ", - "i" - ], - [ - "ĠËĪɪ", - "skËĮoËIJ" - ], - [ - "nd", - "ÉĻn" - ], - [ - "Ġts", - "v" - ], - [ - "Ġh", - "ËĪÉĻËIJ" - ], - [ - "ĠÊĥ", - "ËĪÊĬ" - ], - [ - "ÉĻt", - "ËĮeËIJ" - ], - [ - "p", - "ËĮÉĽ" - ], - [ - "ËĪaɾ", - "ÉĶn" - ], - [ - "Ġp", - "ÉĽÊģ" - ], - [ - "Ġ", - "y" - ], - [ - "m", - "nËĮeËIJ" - ], - [ - "ËĪÉĽ", - "llo" - ], - [ - "ĠÉ¡", - "ËĪÉĻ" - ], - [ - "ĠËĮa", - "d" - ], - [ - "ĠÊĥ", - "v" - ], - [ - "ËĪÊı", - "ɾ" - ], - [ - "r", - "ËĪe" - ], - [ - "y", - "ËIJ" - ], - [ - "Ġp", - "ËĪaËIJs" - ], - [ - "Ġ", - "ËĪÉĽn" - ], - [ - "ɪ", - "dÊĴ" - ], - [ - "ËĪua", - "i" - ], - [ - "Ġf", - "i" - ], - [ - "Ġt", - "ËĪyÉĻ" - ], - [ - "ËĪaËIJ", - "ÉŁ" - ], - [ - "Ġt", - "jËĪe" - ], - [ - "ËĪaËIJn", - "aËIJ" - ], - [ - "st", - "ɾ" - ], - [ - "Êİ", - "e" - ], - [ - "ËĮe", - "ɪt" - ], - [ - "b", - "a" - ], - [ - "ð", - "as" - ], - [ - "v", - "Êģ" - ], - [ - "Ġz", - "ËĪÉĻËIJ" - ], - [ - "ËĪaËIJ", - "li" - ], - [ - "ÉŁÊ°", - "eËIJ" - ], - [ - "ËĪaËIJt", - "eËIJ" - ], - [ - "Ġv", - "ËĪa" - ], - [ - "Ġsa", - "l" - ], - [ - "ËĪaËIJ", - "no" - ], - [ - "ĠÉ¡ÉĻ", - "z" - ], - [ - "ĠhËĪoËIJ", - "ti" - ], - [ - "Ġɲ", - "ËĪiÉĽ" - ], - [ - "t", - "Éľ" - ], - [ - "ĠËĪaËIJ", - "p" - ], - [ - "Ġw", - "ËĪÉĽl" - ], - [ - "Ġm", - "ËĪɪl" - ], - [ - "Ġfy", - "ËIJɾ" - ], - [ - "ËĪÉĽËIJs", - "aËIJ" - ], - [ - "Ġb", - "ËĮiËIJ" - ], - [ - "ËĪaËIJ", - "jaËIJ" - ], - [ - "ËĪɪ", - "p" - ], - [ - "Ġf", - "Êģ" - ], - [ - "tsi", - "ËĪoËIJne" - ], - [ - "Ġw", - "ËĪuÉľ" - ], - [ - "Ġv", - "i" - ], - [ - "ĠwËĪÉij", - "Éľn" - ], - [ - "ËĪoËIJ", - "n" - ], - [ - "ĠÉĹ", - "ËĪÉĻɪ" - ], - [ - "ĠÊĿ", - "ËĪo" - ], - [ - "Ġr", - "a" - ], - [ - "m", - "ÉĻnt" - ], - [ - "ËĪaÊĬ", - "nd" - ], - [ - "Ġp", - "ÉĽÉ¾" 
- ], - [ - "ĠÉĹ", - "ËĪaËIJÊĬ" - ], - [ - "oËIJ", - "ɾ" - ], - [ - "h", - "ËĪo" - ], - [ - "ĠÉĴ", - "n" - ], - [ - "ĠÊİ", - "e" - ], - [ - "ĠsËĪɪ", - "ks" - ], - [ - "É¡", - "n" - ], - [ - "ĠÉ¡", - "ËĪa" - ], - [ - "Ġ", - "θj" - ], - [ - "Ġp", - "ËĪe" - ], - [ - "sp", - "e" - ], - [ - "Ġv", - "ËĪÉĻ" - ], - [ - "Ġf", - "ËĪɪ" - ], - [ - "ĠËĮɪnt", - "ÊĬ" - ], - [ - "l", - "ÉĻn" - ], - [ - "Ġn", - "ËĪiËIJd" - ], - [ - "ĠsËĮÊĬ", - "a" - ], - [ - "ĠËĪu", - "m" - ], - [ - "Ġd", - "ËĪeɪ" - ], - [ - "ĠËĪÊĮ", - "bʰi" - ], - [ - "ËĪÉijËIJ", - "ɾ" - ], - [ - "Ġb", - "ËĪiÉĽÉľt" - ], - [ - "Êİ", - "os" - ], - [ - "Ġtsh", - "ËĪaiÉľ" - ], - [ - "ĠËĮɪ", - "skËĮaËIJ" - ], - [ - "ĠaÊĬ", - "ÉĻ" - ], - [ - "ĠËĪy", - "æ" - ], - [ - "Ġd", - "yn" - ], - [ - "Ġm", - "ËĪiËIJn" - ], - [ - "ĠËĪÊĮ", - "cʰËIJ" - ], - [ - "Ġs", - "ÉĽ" - ], - [ - "Ġn", - "ËĪy" - ], - [ - "Ġn", - "ËĮÉĽl" - ], - [ - "É¡", - "ɾ" - ], - [ - "Êĥ", - "ËĪe" - ], - [ - "ĠÊĤ", - "ËĮÉĽ" - ], - [ - "ĠËĪÉĽ", - "vɹɪ" - ], - [ - "ËĪÉĽl", - "p" - ], - [ - "ĠbËĪa", - "k" - ], - [ - "Ġ", - "eËIJ" - ], - [ - "Ġf", - "ËĪaËIJ" - ], - [ - "Ġk", - "ÉĽl" - ], - [ - "ĠËĪeËIJ", - "s" - ], - [ - "j", - "ËĪaËIJd" - ], - [ - "Ġl", - "ËĮi" - ], - [ - "mb", - "ɾe" - ], - [ - "k", - "tÉĻ" - ], - [ - "nt", - "a" - ], - [ - "t", - "ËĪu" - ], - [ - "Ġð", - "ËĪat" - ], - [ - "ĠËĪa", - "β" - ], - [ - "ÉĻɹ", - "i" - ], - [ - "ĠkwËĮÉĽ", - "lla" - ], - [ - "Ġb", - "ÉĻn" - ], - [ - "r", - "ËĮÉĽ" - ], - [ - "Ġn", - "ÉĶ" - ], - [ - "ĠÉ¡", - "ËĪɪ" - ], - [ - "ĠËĪa", - "p" - ], - [ - "ɹ", - "ÉĻ" - ], - [ - "ËĪa", - "Éľkh" - ], - [ - "ĠÊIJ", - "ËĪi" - ], - [ - "Ġ", - "ËĪÉijËIJ" - ], - [ - "ɪ", - "É¡ÉĻn" - ], - [ - "Ġw", - "ËĪai" - ], - [ - "Ġp", - "ÉĻt" - ], - [ - "kËIJ", - "a" - ], - [ - "Ġb", - "ËĪÉĽËIJ" - ], - [ - "ËĪeËIJ", - "Êĭ" - ], - [ - "ls", - "ÉĻÊĬ" - ], - [ - "ĠcËĪaËIJh", - "ɪËĮeËIJ" - ], - [ - "Ġk", - "ÉĻn" - ], - [ - "ĠËĮaɪn", - "ÉĻm" - ], - [ - "ËĪuËIJ", - "t" - ], - [ - "Ġh", - "ËĪaÊĬ" - ], - [ - "Ġt", - "ËĪanto" - ], - [ - "ĠhÉIJ", - "z" - 
], - [ - "Ġs", - "ËĪÊĮɾ" - ], - [ - "Ġn", - "o" - ], - [ - "Ġt", - "ËĪÉĶËIJ" - ], - [ - "Ġz", - "ËĪaɪ" - ], - [ - "ĠtÉķËĪiÉĽ", - "Éľ" - ], - [ - "Ġko", - "zËĪi" - ], - [ - "Ġk", - "ËĪei" - ], - [ - "ð", - "ËĪÉĶɾ" - ], - [ - "ËĮÉĶ", - "Êģ" - ], - [ - "Ġt", - "ËĪÊĮɾ" - ], - [ - "ĠÊIJ", - "ËĪÉĻ" - ], - [ - "ĠÉķËĪy", - "ÉĽÉľ" - ], - [ - "ĠmËĮÊĬ", - "ÉŁÊ°eËIJ" - ], - [ - "m", - "f" - ], - [ - "Ġv", - "ËĪiËIJdÉľ" - ], - [ - "k", - "ËĪa" - ], - [ - "ĠÉIJ", - "É¡" - ], - [ - "k", - "w" - ], - [ - "ĠÊģ", - "ÉĽ" - ], - [ - "x", - "ÉĻn" - ], - [ - "Ġd", - "ÊĬ" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮeËIJ" - ], - [ - "jËĪaËIJd", - "aËIJ" - ], - [ - "Ġf", - "ÉĻ" - ], - [ - "ĠËĮi", - "mp" - ], - [ - "Ġh", - "ɪz" - ], - [ - "Ġ", - "ʰÏĩ" - ], - [ - "ËĪoËIJ", - "ni" - ], - [ - "Ġx", - "ËĪiÉľ" - ], - [ - "ËĪeËIJ", - "sÊĪ" - ], - [ - "Êı", - "bÉľ" - ], - [ - "ËĮÉĶɾ", - "ke" - ], - [ - "ĠÉ¡", - "ËĪÉĻÊĬ" - ], - [ - "ËĪɪ", - "ÊĥÉĻn" - ], - [ - "l", - "es" - ], - [ - "Ġf", - "ËĪiËIJ" - ], - [ - "É¡", - "tÉĻ" - ], - [ - "ËĪeËIJ", - "re" - ], - [ - "Ġv", - "ËĮaËIJ" - ], - [ - "Ġ", - "ËĪeɪ" - ], - [ - "Ġm", - "ËĪuÉĻÉľn" - ], - [ - "ĠÉ¡ËĪÊĬ", - "d" - ], - [ - "ĠmËĮa", - "ɪn" - ], - [ - "z", - "ËĪe" - ], - [ - "ĠlËĪi", - "Éľ" - ], - [ - "Ġm", - "u" - ], - [ - "Ġk", - "ËĮÉĽl" - ], - [ - "Ġj", - "ËĮÉĻh" - ], - [ - "Ġf", - "ËĮÉĶɾ" - ], - [ - "f", - "ɹ" - ], - [ - "Ġk", - "ËĪaɪn" - ], - [ - "ĠËĪÉĴ", - "lsÉĻÊĬ" - ], - [ - "θ", - "ɪÅĭ" - ], - [ - "Ġth", - "ËĪonÉ¡Éľ" - ], - [ - "t", - "ËĪÉij" - ], - [ - "θj", - "o" - ], - [ - "m", - "ËĪÉĶ" - ], - [ - "Ġ", - "os" - ], - [ - "Ġs", - "ÊĬ" - ], - [ - "ĠsËĪÊĮ", - "mÉĻ" - ], - [ - "ĠvËĮÉĽ", - "n" - ], - [ - "n", - "ËĪo" - ], - [ - "ĠËĪak", - "tÊĥuËIJ" - ], - [ - "É£", - "a" - ], - [ - "Ġtʰ", - "i" - ], - [ - "Ġf", - "ËĮi" - ], - [ - "Ġv", - "ËĪÉĽl" - ], - [ - "ĠtËĪu", - "tËIJi" - ], - [ - "x", - "os" - ] - ] - } -} \ No newline at end of file From 7c4a9d635cb5225ad1cac98f8c603bef7341a156 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Wed, 4 Feb 2026 09:30:48 
+0000 Subject: [PATCH 34/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 508a1332c31e..09a6491b364a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1950,7 +1950,7 @@ def _sample_audio_codes( ) -> Tuple[torch.Tensor, torch.Tensor]: """ Sample audio codes from logits using either local transformer or parallel sampling. - + Returns: audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) @@ -1972,9 +1972,7 @@ def _sample_audio_codes( all_codes_next_argmax = audio_codes_next else: # Parallel sampling from all codebook logits - audio_codes_next = self.sample_codes_from_logits( - all_code_logits_t, temperature=temperature, topk=topk - ) + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # Argmax sampling for reliable EOS detection all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) @@ -1995,7 +1993,7 @@ def _process_phoneme_predictions( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Process phoneme predictions for the current timestep. 
- + Returns: pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) @@ -2034,9 +2032,7 @@ def _process_phoneme_predictions( gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # Select input tokens (GT or predicted) and embed - input_phoneme_tokens_current = ( - gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - ) + input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding @@ -2051,7 +2047,7 @@ def _compute_phoneme_channel_input( ) -> Tuple[torch.Tensor, torch.Tensor]: """ Compute the phoneme channel input embedding with masking. - + Returns: phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) @@ -2061,9 +2057,7 @@ def _compute_phoneme_channel_input( use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # Create zero embedding for items not using phoneme input - zero_phoneme_embedding = torch.zeros( - actual_batch_size, 1, self.cfg.embedding_dim, device=device - ) + zero_phoneme_embedding = torch.zeros(actual_batch_size, 1, self.cfg.embedding_dim, device=device) # Combine: use phoneme embedding where active, zero otherwise phoneme_channel_input_t = ( @@ -2088,7 +2082,7 @@ def _prepare_next_decoder_input( ) -> torch.Tensor: """ Prepare the input embedding for the next decoder step. 
- + Handles: - Mixing context embeddings with generated audio embeddings based on context completeness - Adding streaming text embeddings if in streaming mode @@ -2487,9 +2481,7 @@ def infer_batch( # Calculate predicted lengths, accounting for context offset pred_codes_start_indices = context_plus_audio_lens - min_context_len - predicted_lens = [ - end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size) - ] + predicted_lens = [end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size)] predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices From 61b8afde37bc0b393e1b9d67a7ec8dfa656de2f2 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 5 Feb 2026 14:34:04 -0800 Subject: [PATCH 35/94] Magpietts decoderonly 2601 simplify code (#60) * adding streaming inference support Signed-off-by: Paarth Neekhara * some code cleanup Signed-off-by: Paarth Neekhara * update to get the right number of samples for context audio Signed-off-by: Paarth Neekhara * add text eos token Signed-off-by: Paarth Neekhara * fix sample rate issues Signed-off-by: Paarth Neekhara * simplifying streaming inference Signed-off-by: Paarth Neekhara * streaming batched inference working Signed-off-by: Paarth Neekhara * inference features added Signed-off-by: Paarth Neekhara * Inference function simplified Signed-off-by: Paarth Neekhara * correct handling of audio EOS Signed-off-by: Paarth Neekhara * bug fix Signed-off-by: Paarth Neekhara * simplify streaming init Signed-off-by: Paarth Neekhara * remove unnecessary line Signed-off-by: Paarth Neekhara * bug fix able to reproduce F2F presentation results with new inference Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_streaming_inference.py | 1018 ++++++++ nemo/collections/tts/models/easy_magpietts.py | 2235 ++++++++++------- .../modules/magpietts_inference/inference.py | 20 +- 3 files 
changed, 2354 insertions(+), 919 deletions(-) create mode 100644 examples/tts/magpietts_streaming_inference.py diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py new file mode 100644 index 000000000000..6e72ea77b8e6 --- /dev/null +++ b/examples/tts/magpietts_streaming_inference.py @@ -0,0 +1,1018 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MagpieTTS Streaming Inference Test Script. + +This script tests the streaming TTS inference functionality, supporting both +single sample (batch_size=1) and batched inference (batch_size>1). + +For batched inference, each item in the batch can have different context lengths +and be in different processing phases (context, prompt, phoneme-only, audio). + +Example usage: + # Single sample inference from checkpoint + python examples/tts/magpietts_streaming_inference.py \ + --hparams_file /path/to/hparams.yaml \ + --checkpoint_file /path/to/model.ckpt \ + --codecmodel_path /path/to/codec.nemo \ + --context_audio /path/to/context.wav \ + --text "Hello, this is a test of streaming TTS inference." 
\ + --output_path /path/to/output.wav + + # Batched inference with multiple context audios + python examples/tts/magpietts_streaming_inference.py \ + --nemo_file /path/to/model.nemo \ + --codecmodel_path /path/to/codec.nemo \ + --context_audio /path/to/context1.wav /path/to/context2.wav \ + --context_duration 3.0 5.0 \ + --text "First text to synthesize." "Second text to synthesize." \ + --output_path /path/to/output.wav +""" +from __future__ import annotations + +import argparse +import os +import time +from typing import Optional + +import numpy as np +import soundfile as sf +import torch +from omegaconf import OmegaConf, open_dict + +from nemo.collections.tts.models import EasyMagpieTTSModel +from nemo.utils import logging + + +def load_model( + hparams_file: Optional[str], + checkpoint_file: Optional[str], + nemo_file: Optional[str], + codecmodel_path: str, + device: str = "cuda", +) -> EasyMagpieTTSModel: + """ + Load an EasyMagpieTTSModel from checkpoint or .nemo file. + + Args: + hparams_file: Path to hparams.yaml (required with checkpoint_file). + checkpoint_file: Path to .ckpt file (required with hparams_file). + nemo_file: Path to .nemo file (alternative to hparams + checkpoint). + codecmodel_path: Path to the audio codec model. + device: Device to load model on. + + Returns: + Loaded model ready for inference. 
+ """ + if hparams_file is not None and checkpoint_file is not None: + # Load from hparams + checkpoint + logging.info(f"Loading model from checkpoint: {checkpoint_file}") + model_cfg = OmegaConf.load(hparams_file) + + # Handle different config structures + if "cfg" in model_cfg: + model_cfg = model_cfg.cfg + + with open_dict(model_cfg): + # Override codec model path + model_cfg.codecmodel_path = codecmodel_path + + # Disable training datasets + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel(cfg=model_cfg) + + # Load weights + ckpt = torch.load(checkpoint_file, weights_only=False) + state_dict = ckpt['state_dict'] + model.load_state_dict(state_dict) + + elif nemo_file is not None: + # Load from .nemo file + logging.info(f"Loading model from NeMo archive: {nemo_file}") + model_cfg = EasyMagpieTTSModel.restore_from(nemo_file, return_config=True) + + with open_dict(model_cfg): + model_cfg.codecmodel_path = codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel.restore_from(nemo_file, override_config_path=model_cfg) + + else: + raise ValueError("Must provide either (hparams_file + checkpoint_file) or nemo_file") + + model.to(device) + model.eval() + logging.info("Model loaded and ready for streaming inference.") + + return model + + +def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: + """ + Load audio file and resample if needed. + + Args: + audio_path: Path to audio file. + target_sample_rate: Target sample rate. + + Returns: + Audio tensor of shape (1, num_samples). 
+ """ + audio, sr = sf.read(audio_path, dtype='float32') + + # Convert to mono if stereo + if len(audio.shape) > 1: + audio = audio.mean(axis=1) + + # Resample if needed + if sr != target_sample_rate: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) + + return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) + + +def adjust_audio_to_duration( + audio: torch.Tensor, + sample_rate: int, + target_duration: float, + codec_model_samples_per_frame: int, +) -> torch.Tensor: + """ + Adjust audio to target_duration seconds, aligned to codec frame boundaries. + + The target number of samples is calculated to align with codec frame boundaries: + 1. Convert target_duration to number of codec frames + 2. Convert codec frames back to samples + + If audio is longer than target, take the first target_duration seconds. + If audio is shorter, repeat it until it reaches target_duration seconds. + + Args: + audio: Audio tensor of shape (1, num_samples). + sample_rate: Sample rate of the audio. + target_duration: Target duration in seconds. + codec_model_samples_per_frame: Number of audio samples per codec frame + (codec downsampling factor). + + Returns: + Audio tensor of shape (1, target_num_samples) where target_num_samples + is aligned to codec frame boundaries. 
+ """ + # Calculate target samples aligned to codec frame boundaries + # Same logic as text_to_speech_dataset.py + num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) + target_num_samples = num_codec_frames * codec_model_samples_per_frame + current_num_samples = audio.size(1) + + if current_num_samples >= target_num_samples: + # Audio is longer than target - take the first target_duration seconds + audio = audio[:, :target_num_samples] + else: + # Audio is shorter - repeat until we have enough samples + num_repeats = int(np.ceil(target_num_samples / current_num_samples)) + audio_repeated = audio.repeat(1, num_repeats) + audio = audio_repeated[:, :target_num_samples] + + return audio + + +def run_streaming_inference( + model: EasyMagpieTTSModel, + context_audio: torch.Tensor, + context_audio_lens: torch.Tensor, + context_text: str, + text: str, + phoneme_text: Optional[str] = None, + use_gt_phonemes: bool = False, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.5, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 500, + verbose: bool = True, + force_dropout_text: bool = False, +) -> tuple: + """ + Run streaming TTS inference. + + Args: + model: The loaded EasyMagpieTTSModel. + context_audio: Context audio tensor (1, num_samples). + context_audio_lens: Length of context audio (1,). + context_text: Context text for speaker conditioning. + text: Main text to synthesize. + phoneme_text: Optional phoneme text for GT conditioning. If None, uses text. + use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). + inference_mode: Inference mode name (e.g., "streaming_4_8"). + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + max_steps: Maximum generation steps. 
+ verbose: Whether to print progress. + + Returns: + Tuple of (output, timing_info, context_audio_decoded, context_audio_decoded_lens). + output is StreamingFinalizeOutput with audio, codes, and phoneme predictions. + context_audio_decoded is the decoded context audio from the model's internal codes (for sanity checking). + """ + device = next(model.parameters()).device + + # Encode context audio to codes + context_audio = context_audio.to(device) + context_audio_lens = context_audio_lens.to(device) + + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = model.audio_to_codes( + context_audio, context_audio_lens + ) + + # Tokenize context text + # Use the text conditioning tokenizer + tokenizer_name = model.text_conditioning_tokenizer_name + context_text_tokens = model.tokenizer.encode(context_text, tokenizer_name=tokenizer_name) + context_text_tokens = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_tokens_lens = torch.tensor([context_text_tokens.size(1)], dtype=torch.long, device=device) + + # Tokenize main text + # Get the appropriate tokenizer name for main text + if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: + main_tokenizer_name = 'english_phoneme' + else: + main_tokenizer_name = tokenizer_name + + text_tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) + text_tokens = text_tokens + [model.eos_id] + text_tokens = torch.tensor(text_tokens, dtype=torch.long, device=device) + + # Tokenize phoneme text if provided (for GT phoneme conditioning) + gt_phoneme_tokens = None + gt_phoneme_tokens_lens = None + if model.phoneme_tokenizer is not None: + phoneme_source = phoneme_text if phoneme_text is not None else text + phoneme_tokens_list = model.phoneme_tokenizer.encode(phoneme_source) + # Add BOS and EOS + bos_id = model.phoneme_tokenizer.bos_token_id + eos_id = model.phoneme_tokenizer.eos_token_id + phoneme_tokens_list = [bos_id] + 
phoneme_tokens_list + [eos_id] + gt_phoneme_tokens = torch.tensor([phoneme_tokens_list], dtype=torch.long, device=device) + gt_phoneme_tokens_lens = torch.tensor([len(phoneme_tokens_list)], dtype=torch.long, device=device) + + phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' + + # Get streaming delays for logging + mode_name = inference_mode or model.default_inference_mode + training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) + phoneme_delay = training_mode.streaming_phonemes_delay + speech_delay = training_mode.streaming_speech_delay + + if verbose: + logging.info(f"Context audio codes shape: {context_audio_codes.shape}") + logging.info(f"Context text tokens: {context_text_tokens.shape}") + logging.info(f"Main text tokens: {text_tokens.shape} ({len(text_tokens)} tokens)") + if gt_phoneme_tokens is not None: + logging.info(f"GT phoneme tokens: {gt_phoneme_tokens.shape} ({gt_phoneme_tokens_lens[0].item()} tokens)") + logging.info(f"Phoneme input type: {phoneme_input_type}") + logging.info(f"Using inference mode: {mode_name}") + logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") + logging.info("Phases: Prompt (0 to phoneme_delay) -> Phoneme-only (phoneme_delay to speech_delay) -> Audio") + + # Initialize streaming state + start_time = time.time() + + state = model.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + inference_mode=inference_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + ) + + init_time = time.time() - start_time + if verbose: + logging.info(f"Streaming init completed in {init_time:.3f}s") + + # Decode and return context 
audio for sanity check + # The context_audio_codes in state have special tokens and are stacked + # We need to remove special tokens and decode them + with torch.inference_mode(): + ctx_codes = state.context_audio_codes.clone() + ctx_codes_lens = state.context_audio_codes_lens.clone() + # Remove special tokens (BOS and EOS) + ctx_codes, ctx_codes_lens = model.remove_special_tokens( + codes=ctx_codes, + codes_len=ctx_codes_lens, + ) + # codes_to_audio will handle unstacking internally + context_audio_decoded, context_audio_decoded_lens, _ = model.codes_to_audio(ctx_codes, ctx_codes_lens) + + # Feed text tokens one at a time + generation_start = time.time() + num_audio_frames = 0 + num_phoneme_frames = 0 + prompt_phase_tokens = 0 + phoneme_only_phase_tokens = 0 + + for i, token in enumerate(text_tokens): + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=token.unsqueeze(0), force_dropout_text=force_dropout_text + ) + + # Track which phase we're in + if audio_codes is None and phoneme_tokens is None: + prompt_phase_tokens += 1 + elif audio_codes is None and phoneme_tokens is not None: + phoneme_only_phase_tokens += 1 + num_phoneme_frames += 1 + else: + if audio_codes is not None: + num_audio_frames += 1 + if phoneme_tokens is not None: + num_phoneme_frames += 1 + + if verbose and (i + 1) % 10 == 0: + phase = "prompt" if audio_codes is None and phoneme_tokens is None else ( + "phoneme-only" if audio_codes is None else "audio" + ) + logging.info( + f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " + f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" + ) + + if state.finished: + if verbose: + logging.info(f"EOS detected at text token {i + 1}") + break + + # Continue generating until finished (text has ended) + continuation_steps = 0 + while not state.finished and continuation_steps < max_steps: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, 
force_dropout_text=force_dropout_text) + + if audio_codes is not None: + num_audio_frames += 1 + if phoneme_tokens is not None: + num_phoneme_frames += 1 + + continuation_steps += 1 + + if verbose and continuation_steps % 20 == 0: + logging.info( + f"Continuation step {continuation_steps}, " + f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" + ) + + generation_time = time.time() - generation_start + + if verbose: + logging.info(f"Generation completed in {generation_time:.3f}s") + logging.info(f"Prompt phase tokens: {prompt_phase_tokens}") + logging.info(f"Phoneme-only phase tokens: {phoneme_only_phase_tokens}") + logging.info(f"Audio frames generated: {num_audio_frames}") + logging.info(f"Phoneme frames generated: {num_phoneme_frames}") + logging.info(f"Continuation steps: {continuation_steps}") + + # Finalize and get complete audio + output = model.streaming_finalize(state) + + total_time = time.time() - start_time + + if verbose and output.phoneme_text: + logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") + + timing_info = { + 'init_time': init_time, + 'generation_time': generation_time, + 'total_time': total_time, + 'num_text_tokens': len(text_tokens), + 'prompt_phase_tokens': prompt_phase_tokens, + 'phoneme_only_phase_tokens': phoneme_only_phase_tokens, + 'num_audio_frames': num_audio_frames, + 'num_phoneme_frames': num_phoneme_frames, + 'continuation_steps': continuation_steps, + } + + return output, timing_info, context_audio_decoded, context_audio_decoded_lens + + +def run_batched_streaming_inference( + model: EasyMagpieTTSModel, + context_audios: list[torch.Tensor], + context_audio_lens_list: list[torch.Tensor], + context_texts: list[str], + texts: list[str], + phoneme_texts: Optional[list[str]] = None, + use_gt_phonemes: bool = False, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.5, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + 
max_steps: int = 500, + verbose: bool = True, + force_dropout_text: bool = False, +) -> tuple: + """ + Run batched streaming TTS inference. + + Each batch item can have different context lengths. The streaming processes + only the minimum context length initially, then continues processing remaining + context per-item in the "context phase" before moving to prompt/audio phases. + + Args: + model: The loaded EasyMagpieTTSModel. + context_audios: List of context audio tensors, each (1, num_samples). + context_audio_lens_list: List of context audio lengths, each (1,). + context_texts: List of context texts for speaker conditioning. + texts: List of main texts to synthesize. + phoneme_texts: Optional list of phoneme texts for GT conditioning. If None, uses texts. + use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). + inference_mode: Inference mode name (e.g., "streaming_4_8"). + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + max_steps: Maximum generation steps. + verbose: Whether to print progress. + + Returns: + Tuple of (output, timing_info) where output is StreamingFinalizeOutput. 
+ """ + device = next(model.parameters()).device + batch_size = len(context_audios) + + assert len(context_texts) == batch_size, "Number of context texts must match batch size" + assert len(texts) == batch_size, "Number of texts must match batch size" + + # Encode context audio to codes for each item + context_audio_codes_list = [] + context_audio_codes_lens_list = [] + + with torch.inference_mode(): + for i in range(batch_size): + context_audio = context_audios[i].to(device) + context_audio_lens = context_audio_lens_list[i].to(device) + codes, codes_lens = model.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes_list.append(codes) + context_audio_codes_lens_list.append(codes_lens) + + # Pad and batch context audio codes + max_context_len = max(c.size(-1) for c in context_audio_codes_list) + num_codebooks = context_audio_codes_list[0].size(1) + + context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_len, dtype=torch.long, device=device) + context_audio_codes_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i in range(batch_size): + codes = context_audio_codes_list[i] + codes_len = context_audio_codes_lens_list[i] + context_audio_codes[i, :, :codes.size(-1)] = codes[0] + context_audio_codes_lens[i] = codes_len[0] + + # Tokenize context texts + tokenizer_name = model.text_conditioning_tokenizer_name + context_text_tokens_list = [] + for ctx_text in context_texts: + tokens = model.tokenizer.encode(ctx_text, tokenizer_name=tokenizer_name) + context_text_tokens_list.append(tokens) + + # Pad and batch context text tokens + max_context_text_len = max(len(t) for t in context_text_tokens_list) + context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) + context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i, tokens in enumerate(context_text_tokens_list): + context_text_tokens[i, :len(tokens)] = torch.tensor(tokens, 
dtype=torch.long, device=device) + context_text_tokens_lens[i] = len(tokens) + + # Tokenize main texts + if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: + main_tokenizer_name = 'english_phoneme' + else: + main_tokenizer_name = tokenizer_name + + text_tokens_list = [] + for text in texts: + tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) + tokens = tokens + [model.eos_id] + text_tokens_list.append(torch.tensor(tokens, dtype=torch.long, device=device)) + + max_text_len = max(len(t) for t in text_tokens_list) + + # Tokenize phoneme texts if model has phoneme tokenizer + gt_phoneme_tokens = None + gt_phoneme_tokens_lens = None + if model.phoneme_tokenizer is not None: + phoneme_sources = phoneme_texts if phoneme_texts is not None else texts + bos_id = model.phoneme_tokenizer.bos_token_id + eos_id = model.phoneme_tokenizer.eos_token_id + phoneme_tokens_lists = [] + for ptext in phoneme_sources: + tokens = model.phoneme_tokenizer.encode(ptext) + tokens = [bos_id] + tokens + [eos_id] + phoneme_tokens_lists.append(tokens) + max_phoneme_len = max(len(t) for t in phoneme_tokens_lists) + gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) + gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + for i, tokens in enumerate(phoneme_tokens_lists): + gt_phoneme_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + gt_phoneme_tokens_lens[i] = len(tokens) + + phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' + + # Get streaming delays for logging + mode_name = inference_mode or model.default_inference_mode + training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) + phoneme_delay = training_mode.streaming_phonemes_delay + speech_delay = training_mode.streaming_speech_delay + + if verbose: + logging.info(f"Batch size: {batch_size}") + logging.info(f"Context audio codes shape: 
{context_audio_codes.shape}") + logging.info(f"Context audio codes lens: {context_audio_codes_lens.tolist()}") + logging.info(f"Context text tokens shape: {context_text_tokens.shape}") + logging.info(f"Context text tokens lens: {context_text_tokens_lens.tolist()}") + logging.info(f"Max text tokens: {max_text_len}") + logging.info(f"Text tokens per item: {[len(t) for t in text_tokens_list]}") + if gt_phoneme_tokens is not None: + logging.info(f"GT phoneme tokens shape: {gt_phoneme_tokens.shape}") + logging.info(f"GT phoneme tokens lens: {gt_phoneme_tokens_lens.tolist()}") + logging.info(f"Phoneme input type: {phoneme_input_type}") + logging.info(f"Using inference mode: {mode_name}") + logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") + + # Initialize streaming state + start_time = time.time() + + state = model.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + inference_mode=inference_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + ) + + init_time = time.time() - start_time + if verbose: + logging.info(f"Streaming init completed in {init_time:.3f}s") + logging.info(f"Initial context_position: {state.context_position.tolist()}") + logging.info(f"Full context lens: {state.full_context_lens.tolist()}") + + # Feed text tokens one at a time + generation_start = time.time() + step_count = 0 + num_audio_frames = 0 + + # Track which items have finished their text + text_positions = torch.zeros(batch_size, dtype=torch.long, device=device) + text_finished_mask = torch.zeros(batch_size, dtype=torch.bool, device=device) + + # Main streaming loop + while not state.finished.all() 
and step_count < max_steps + max_text_len: + # Determine which items are in context phase + in_context_phase = state.context_position < state.full_context_lens + + # Prepare text tokens for this step + # Items in context phase: use 0 (will be ignored) + # Items not in context phase: use their next text token or 0 if text finished + text_tokens_batch = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i in range(batch_size): + if not in_context_phase[i] and not text_finished_mask[i]: + if text_positions[i] < len(text_tokens_list[i]): + text_tokens_batch[i] = text_tokens_list[i][text_positions[i]] + text_positions[i] += 1 + else: + text_finished_mask[i] = True + + # Determine if we should pass None (all items have finished text and exited context) + all_text_done = text_finished_mask.all() and not in_context_phase.any() + + if all_text_done: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, force_dropout_text=force_dropout_text) + else: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text) + + if audio_codes is not None: + num_audio_frames += 1 + + step_count += 1 + + if verbose and step_count % 20 == 0: + in_ctx = state.context_position < state.full_context_lens + logging.info( + f"Step {step_count}: " + f"in_context_phase={in_ctx.tolist()}, " + f"text_positions={text_positions.tolist()}, " + f"audio_frames={num_audio_frames}, " + f"finished={state.finished.tolist()}" + ) + + generation_time = time.time() - generation_start + + if verbose: + logging.info(f"Generation completed in {generation_time:.3f}s") + logging.info(f"Total steps: {step_count}") + logging.info(f"Audio frames generated: {num_audio_frames}") + + # Finalize and get complete audio + output = model.streaming_finalize(state) + + total_time = time.time() - start_time + + if verbose and output.phoneme_text: + for i, ptext in enumerate(output.phoneme_text): + 
logging.info(f"Predicted phoneme text [{i}]: {ptext}") + + timing_info = { + 'init_time': init_time, + 'generation_time': generation_time, + 'total_time': total_time, + 'num_text_tokens': [len(t) for t in text_tokens_list], + 'num_audio_frames': num_audio_frames, + 'total_steps': step_count, + } + + return output, timing_info + + +def main(): + parser = argparse.ArgumentParser( + description="MagpieTTS Streaming Inference Test Script", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Model loading arguments + model_group = parser.add_argument_group('Model Loading') + model_group.add_argument( + '--hparams_file', + type=str, + default=None, + help='Path to hparams.yaml file', + ) + model_group.add_argument( + '--checkpoint_file', + type=str, + default=None, + help='Path to .ckpt checkpoint file', + ) + model_group.add_argument( + '--nemo_file', + type=str, + default=None, + help='Path to .nemo model file', + ) + model_group.add_argument( + '--codecmodel_path', + type=str, + required=True, + help='Path to audio codec model (.nemo)', + ) + + # Input arguments + input_group = parser.add_argument_group('Input') + input_group.add_argument( + '--context_audio', + type=str, + nargs='+', + required=True, + help='Path(s) to context audio file(s) for speaker cloning. ' + 'Multiple files enable batched inference.', + ) + input_group.add_argument( + '--context_text', + type=str, + nargs='+', + default=["[NO TEXT CONTEXT]"], + help='Context text(s) for speaker conditioning. Provide one per context audio, ' + 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', + ) + input_group.add_argument( + '--context_duration', + type=float, + nargs='+', + default=[5.0], + help='Target duration(s) for context audio in seconds. Provide one per context audio, ' + 'or a single value to use for all. If audio is longer, ' + 'first N seconds are used. If shorter, audio is repeated. 
(default: 5.0)', + ) + input_group.add_argument( + '--text', + type=str, + nargs='+', + required=True, + help='Text(s) to synthesize. Provide one per context audio for batched inference.', + ) + input_group.add_argument( + '--phoneme_text', + type=str, + nargs='+', + default=None, + help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. ' + 'Provide one per context audio for batched inference.', + ) + input_group.add_argument( + '--use_gt_phonemes', + action='store_true', + help='Use ground-truth phonemes as decoder input (teacher forcing). ' + 'If not set, uses model-predicted phonemes.', + ) + + # Output arguments + output_group = parser.add_argument_group('Output') + output_group.add_argument( + '--output_path', + type=str, + default='streaming_output.wav', + help='Path for output audio file', + ) + + # Inference arguments + infer_group = parser.add_argument_group('Inference Parameters') + infer_group.add_argument( + '--inference_mode', + type=str, + default=None, + help='Inference mode name (e.g., "streaming_4_8"). 
Uses model default if not specified.', + ) + infer_group.add_argument( + '--use_cfg', + action='store_true', + help='Enable classifier-free guidance', + ) + infer_group.add_argument( + '--cfg_scale', + type=float, + default=1.5, + help='CFG scale factor (higher = stronger conditioning)', + ) + infer_group.add_argument( + '--use_local_transformer', + action='store_true', + help='Use local transformer for inference', + ) + infer_group.add_argument( + '--temperature', + type=float, + default=0.7, + help='Sampling temperature', + ) + infer_group.add_argument( + '--topk', + type=int, + default=80, + help='Top-k sampling parameter', + ) + infer_group.add_argument( + '--max_steps', + type=int, + default=500, + help='Maximum generation steps after text ends', + ) + infer_group.add_argument( + '--device', + type=str, + default='cuda', + choices=['cuda', 'cpu'], + help='Device to run inference on', + ) + infer_group.add_argument( + '--verbose', + action='store_true', + help='Print detailed progress information', + ) + infer_group.add_argument( + '--force_dropout_text', + action='store_true', + help='Force dropout of text embeddings (pass zeros) to test phoneme-only inference', + ) + + args = parser.parse_args() + + # Validate arguments + has_ckpt_mode = args.hparams_file is not None and args.checkpoint_file is not None + has_nemo_mode = args.nemo_file is not None + + if not (has_ckpt_mode or has_nemo_mode): + parser.error("Must provide either (--hparams_file and --checkpoint_file) or --nemo_file") + + # Load model + model = load_model( + hparams_file=args.hparams_file, + checkpoint_file=args.checkpoint_file, + nemo_file=args.nemo_file, + codecmodel_path=args.codecmodel_path, + device=args.device, + ) + + model = model.float() + + # Determine batch size from number of context audios + batch_size = len(args.context_audio) + + # Expand context_text, context_duration, and text to match batch_size + context_texts = args.context_text + if len(context_texts) == 1 and batch_size > 
1: + context_texts = context_texts * batch_size + elif len(context_texts) != batch_size: + parser.error(f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1") + + context_durations = args.context_duration + if len(context_durations) == 1 and batch_size > 1: + context_durations = context_durations * batch_size + elif len(context_durations) != batch_size: + parser.error(f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1") + + texts = args.text + if len(texts) == 1 and batch_size > 1: + texts = texts * batch_size + elif len(texts) != batch_size: + parser.error(f"Number of texts ({len(texts)}) must match number of context_audios ({batch_size}) or be 1") + + # Handle phoneme_text - default to text if not provided + phoneme_texts = args.phoneme_text + if phoneme_texts is None: + phoneme_texts = texts + elif len(phoneme_texts) == 1 and batch_size > 1: + phoneme_texts = phoneme_texts * batch_size + elif len(phoneme_texts) != batch_size: + parser.error(f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1") + + # Load and process context audios + context_audios = [] + context_audio_lens_list = [] + + for i, (audio_path, duration) in enumerate(zip(args.context_audio, context_durations)): + logging.info(f"Loading context audio {i+1}/{batch_size} from: {audio_path}") + audio = load_audio(audio_path, model.sample_rate) + original_duration = audio.size(1) / model.sample_rate + logging.info(f" Original duration: {original_duration:.2f}s") + + # Adjust to target duration (aligned to codec frame boundaries) + audio = adjust_audio_to_duration(audio, model.sample_rate, duration, model.codec_model_samples_per_frame) + adjusted_duration = audio.size(1) / model.sample_rate + logging.info(f" Adjusted duration: {adjusted_duration:.2f}s (target: {duration}s, codec-aligned)") + + context_audios.append(audio) + 
context_audio_lens_list.append(torch.tensor([audio.size(1)], dtype=torch.long)) + + logging.info(f"\nBatch size: {batch_size}") + logging.info(f"Context texts: {context_texts}") + logging.info(f"Texts to synthesize: {texts}") + logging.info(f"Phoneme texts: {phoneme_texts}") + logging.info(f"Use GT phonemes: {args.use_gt_phonemes}") + + # Use single-sample or batched inference + if batch_size == 1: + logging.info("\n=== Running single-sample streaming inference ===") + output, timing_info, context_audio_decoded, context_audio_decoded_lens = run_streaming_inference( + model=model, + context_audio=context_audios[0], + context_audio_lens=context_audio_lens_list[0], + context_text=context_texts[0], + text=texts[0], + phoneme_text=phoneme_texts[0], + use_gt_phonemes=args.use_gt_phonemes, + inference_mode=args.inference_mode, + use_cfg=args.use_cfg, + cfg_scale=args.cfg_scale, + use_local_transformer=args.use_local_transformer, + temperature=args.temperature, + topk=args.topk, + max_steps=args.max_steps, + verbose=args.verbose, + force_dropout_text=args.force_dropout_text, + ) + + # Save output + output_dir = os.path.dirname(args.output_path) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir) + + audio_np = output.audio[0, :output.audio_len[0].item()].cpu().numpy() + sf.write(args.output_path, audio_np, model.output_sample_rate) + logging.info(f"Output saved to: {args.output_path}") + + # Save decoded context audio for sanity check + output_base, output_ext = os.path.splitext(args.output_path) + context_output_path = f"{output_base}_context_decoded{output_ext}" + context_audio_np = context_audio_decoded[0, :context_audio_decoded_lens[0].item()].cpu().numpy() + sf.write(context_output_path, context_audio_np, model.output_sample_rate) + + logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") + logging.info(f"Context audio duration: {context_audio_decoded_lens[0].item() / model.output_sample_rate:.2f}s") + 
logging.info(f"Audio duration: {output.audio_len[0].item() / model.output_sample_rate:.2f}s") + logging.info(f"Generated codes shape: {output.audio_codes.shape}") + if output.phoneme_text: + logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") + + # Print timing summary + logging.info("\n=== Timing Summary ===") + logging.info(f"Init time: {timing_info['init_time']:.3f}s") + logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") + logging.info(f"Total time: {timing_info['total_time']:.3f}s") + logging.info(f"Text tokens processed: {timing_info['num_text_tokens']}") + logging.info(f" - Prompt phase tokens: {timing_info['prompt_phase_tokens']}") + logging.info(f" - Phoneme-only phase tokens: {timing_info['phoneme_only_phase_tokens']}") + logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") + logging.info(f"Phoneme frames generated: {timing_info['num_phoneme_frames']}") + logging.info(f"Continuation steps: {timing_info['continuation_steps']}") + + # Calculate RTF + audio_duration = output.audio_len[0].item() / model.output_sample_rate + rtf = audio_duration / timing_info['total_time'] + logging.info(f"Real-time factor (RTF): {rtf:.2f}x") + + else: + logging.info(f"\n=== Running batched streaming inference (batch_size={batch_size}) ===") + output, timing_info = run_batched_streaming_inference( + model=model, + context_audios=context_audios, + context_audio_lens_list=context_audio_lens_list, + context_texts=context_texts, + texts=texts, + phoneme_texts=phoneme_texts, + use_gt_phonemes=args.use_gt_phonemes, + inference_mode=args.inference_mode, + use_cfg=args.use_cfg, + cfg_scale=args.cfg_scale, + use_local_transformer=args.use_local_transformer, + temperature=args.temperature, + topk=args.topk, + max_steps=args.max_steps, + verbose=args.verbose, + force_dropout_text=args.force_dropout_text, + ) + + # Save outputs for each batch item + output_dir = os.path.dirname(args.output_path) + if output_dir and not 
os.path.exists(output_dir): + os.makedirs(output_dir) + + output_base, output_ext = os.path.splitext(args.output_path) + + for i in range(batch_size): + output_path_i = f"{output_base}_{i}{output_ext}" + audio_np = output.audio[i, :output.audio_len[i].item()].cpu().numpy() + sf.write(output_path_i, audio_np, model.output_sample_rate) + audio_duration_i = output.audio_len[i].item() / model.output_sample_rate + logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") + if output.phoneme_text and i < len(output.phoneme_text): + logging.info(f" Predicted phoneme text: {output.phoneme_text[i]}") + + logging.info(f"\nGenerated codes shape: {output.audio_codes.shape}") + + # Print timing summary + logging.info("\n=== Timing Summary ===") + logging.info(f"Init time: {timing_info['init_time']:.3f}s") + logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") + logging.info(f"Total time: {timing_info['total_time']:.3f}s") + logging.info(f"Text tokens per item: {timing_info['num_text_tokens']}") + logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") + logging.info(f"Total steps: {timing_info['total_steps']}") + + # Calculate average RTF + total_audio_duration = sum(output.audio_len[i].item() for i in range(batch_size)) / model.output_sample_rate + avg_rtf = total_audio_duration / timing_info['total_time'] + logging.info(f"Average real-time factor (RTF): {avg_rtf:.2f}x") + logging.info(f"Total audio duration (all items): {total_audio_duration:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 09a6491b364a..1351b8409417 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -15,7 +15,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import Dict, List, Optional, Sequence, Tuple +from 
typing import Any, Dict, List, Optional, Sequence, Tuple import torch import wandb @@ -68,38 +68,6 @@ class TrainingMode: mode_idx: int -@dataclass -class ContextTensors: - """ - Output dataclass from prepare_context_tensors containing all context-related tensors. - - Attributes: - context_embedding: Combined context embedding tensor (B, T_total, E) - context_lens: Length of context for each batch item (B,) - context_audio_codes: Audio codes for context audio (B, C, T') - context_audio_embedded: Embedded context audio codes (B, T', E) - context_audio_codes_lens: Length of context audio codes (B,) - text_embedded: Embedded text tokens (B, L, E) - text_lens: Length of text for each batch item (B,) - context_text_tokens: Context text token IDs (B, L) - context_text_lens: Length of context text (B,) - remaining_text_embedded: Embedded remaining text for streaming mode, None otherwise (B, T, E) - remaining_text_lens: Length of remaining text for streaming mode, None otherwise (B,) - """ - - context_embedding: torch.Tensor - context_lens: torch.Tensor - context_audio_codes: torch.Tensor - context_audio_embedded: torch.Tensor - context_audio_codes_lens: torch.Tensor - text_embedded: torch.Tensor - text_lens: torch.Tensor - context_text_tokens: torch.Tensor - context_text_lens: torch.Tensor - remaining_text_embedded: Optional[torch.Tensor] - remaining_text_lens: Optional[torch.Tensor] - - @dataclass class ProcessBatchOutput: """ @@ -132,6 +100,120 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] = None +@dataclass +class StreamingState: + """ + State for streaming TTS inference with batch support. + + This dataclass maintains all the necessary state for autoregressive streaming + generation, allowing text tokens to be fed incrementally. Supports arbitrary + batch sizes where each batch item can have different context lengths and be + in different phases. + + The streaming operates in four phases (per batch item): + 1. 
Context phase (context_position < full_context_lens): Processing remaining context + 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only + 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions + + Attributes: + batch_size: Number of items in the batch. + past_key_values: KV cache from the transformer for efficient autoregressive decoding. + cache_seq_len: Current sequence length in the cache. + all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. + all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). + context_audio_codes: Processed context audio codes with special tokens. + context_audio_codes_lens: Length of context audio codes. + context_lens: Total context length (task_embedding + context_audio + context_text). + full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). + full_context_lens: Full context length for each batch item (B,). + context_position: How much context has been processed per batch item (B,). + text_tokens_seen: Number of text tokens processed so far per batch item (B,). + phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). + audio_steps: Number of audio prediction steps taken per batch item (B,). + phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + finished: Whether generation is complete per batch item (B,) bool tensor. + device: Device tensors are on. + training_mode: The training mode being used for inference. + use_cfg: Whether classifier-free guidance is enabled. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer for inference. + temperature: Sampling temperature. + topk: Top-k sampling parameter. 
+ dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). + last_hidden: Last hidden state from transformer. + text_finished: Whether text input has finished per batch item (B,) bool tensor. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). + last_audio_codes: Last predicted audio codes (B, num_codebooks). + audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). + audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. + phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). + phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. + """ + + batch_size: int + past_key_values: Optional[Tuple] + cache_seq_len: int + all_predictions: List[torch.Tensor] + all_phoneme_predictions: List[torch.Tensor] + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + context_lens: torch.Tensor + full_context_embedding: torch.Tensor + full_context_lens: torch.Tensor + context_position: torch.Tensor + text_tokens_seen: torch.Tensor + phoneme_steps: torch.Tensor + audio_steps: torch.Tensor + phoneme_stream_ended: torch.Tensor + finished: torch.Tensor + device: torch.device + training_mode: TrainingMode + use_cfg: bool + cfg_scale: float + use_local_transformer: bool + temperature: float + topk: int + dummy_context_embedding_unconditional: Optional[torch.Tensor] + last_hidden: torch.Tensor + text_finished: torch.Tensor + phoneme_input_type: str + phoneme_sampling_method: str + last_phoneme_tokens: Optional[torch.Tensor] + last_audio_codes: Optional[torch.Tensor] + audio_prediction_start_idx: torch.Tensor + audio_prediction_end_idx: torch.Tensor + 
phoneme_prediction_start_idx: torch.Tensor + phoneme_prediction_end_idx: torch.Tensor + gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings + gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + + +@dataclass +class StreamingFinalizeOutput: + """Output from streaming_finalize containing audio and phoneme predictions.""" + + audio: torch.Tensor # (B, max_audio_len) generated audio waveform + audio_len: torch.Tensor # (B,) length of audio per batch item + audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes + audio_codes_len: torch.Tensor # (B,) length of codes per batch item + phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item + phoneme_text: List[str] # Decoded phoneme strings per batch item + + +@dataclass +class InferBatchOutput: + """Output dataclass for EasyMagpieTTS infer_batch method.""" + + predicted_audio: torch.Tensor # (B, T_audio) + predicted_audio_lens: torch.Tensor # (B,) + predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) + predicted_codes_lens: torch.Tensor # (B,) + rtf_metrics: Dict[str, Any] + + def worker_init_fn(worker_id): # For mp.set_start_method("spawn", force=True) # The dataset class should be picklable, so we initialize non-picklable objects here @@ -885,13 +967,13 @@ def log_val_audio_example( wandb_audio_log[f"Audio/Example_{idx}"] = list() if context_audio_np is not None: wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context") + wandb.Audio(context_audio_np, sample_rate=self.output_sample_rate, caption="context") ) wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction") + wandb.Audio(pred_audio_np, sample_rate=self.output_sample_rate, caption="prediction") ) wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(target_audio_np, sample_rate=self.sample_rate, 
caption="target") + wandb.Audio(target_audio_np, sample_rate=self.output_sample_rate, caption="target") ) if is_tb: @@ -900,19 +982,19 @@ def log_val_audio_example( f'Example_{idx}/context', context_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) logger.experiment.add_audio( f'Example_{idx}/prediction', pred_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) logger.experiment.add_audio( f'Example_{idx}/target', target_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) return wandb_audio_log @@ -977,27 +1059,21 @@ def join_embeddings_temporally( def prepare_context_tensors( self, - text: torch.Tensor, - text_lens: torch.Tensor, context_text_tokens: torch.Tensor, context_text_tokens_lens: torch.Tensor, context_audio_codes: Optional[torch.Tensor] = None, context_audio_codes_lens: Optional[torch.Tensor] = None, context_audio: Optional[torch.Tensor] = None, context_audio_lens: Optional[torch.Tensor] = None, - dropout_text_input: bool = False, training_mode: Optional[TrainingMode] = None, - ) -> ContextTensors: + dropout_conditional_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - Prepare context tensors for the EasyMagpieTTS model. - - This function processes the input text, context audio, and context text to create - the combined context embedding that will be fed to the transformer decoder. It handles - both 'full' and 'streaming' text input modes. + Prepare context tensors (without text) for the simplified process_batch. + This function processes context audio and context text to create the combined + context embedding. 
Args: - text: Input text token IDs (B, L) - text_lens: Length of text for each batch item (B,) context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) context_text_tokens_lens: Length of context text for each batch item (B,) context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). @@ -1008,57 +1084,24 @@ def prepare_context_tensors( Used to compute context_audio_codes if not provided. context_audio_lens: Length of context audio (B,). Required if context_audio is provided. - dropout_text_input: If True, zero out the text embedding for classifier-free guidance. training_mode: Optional TrainingMode object specifying the mode to use. If None, uses the first mode from training_modes as default. + dropout_conditional_input: If True, replace context with CFG unconditional token. Returns: - ContextTensors: A dataclass containing all prepared context tensors including: - - context_embedding: Combined context embedding (B, T_total, E) + Tuple of: + - context_embedding: Combined context embedding (B, T_context, E) - context_lens: Total context length per batch item (B,) - context_audio_codes: Processed audio codes with special tokens (B, C, T') - - context_audio_embedded: Embedded context audio (B, T', E) - context_audio_codes_lens: Length of processed context audio codes (B,) - - text_embedded: Embedded text tokens (B, L, E) - - text_lens: Text length per batch item (B,) - - context_text_tokens: Context text token IDs (B, L) - - context_text_lens: Context text length per batch item (B,) - - remaining_text_embedded: For streaming mode, embedded remaining text (B, T, E) - - remaining_text_lens: For streaming mode, remaining text length (B,) - - Raises: - ValueError: If neither context_audio_codes nor context_audio is provided. - ValueError: If text_input_mode is not 'full' or 'streaming'. 
""" # Determine the mode parameters to use - # If no mode is specified, use the first (default) mode if training_mode is None: training_mode = self.training_modes[0] - current_text_input_mode = training_mode.text_input_mode - current_streaming_speech_delay = training_mode.streaming_speech_delay - current_streaming_phonemes_delay = training_mode.streaming_phonemes_delay current_mode_idx = training_mode.mode_idx - - text_embedded = self.decoder.get_input_embeddings()(text) - if self.use_bpe_char_tokenizer: - text_mask = get_mask_from_lengths(text_lens) - cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) - text_embedded = text_embedded + cas_embedding - - if text_embedded.shape[1] < current_streaming_speech_delay + 1: - # If text is too short, pad it with zeros - padding_tensor = torch.zeros( - text_embedded.shape[0], - current_streaming_speech_delay + 1 - text_embedded.shape[1], - text_embedded.shape[2], - device=text_embedded.device, - ) - text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) - - if dropout_text_input: - # Make text embedding all zeros - text_embedded = text_embedded * 0.0 + batch_size = context_text_tokens.size(0) + device = context_text_tokens.device # Context Audio if context_audio_codes is None: @@ -1093,63 +1136,237 @@ def prepare_context_tensors( context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) # Prepare task embedding for multi-mode training - # Only use task embedding if there are multiple modes (task_embedding is not None) task_embedding = None task_embedding_lens = None if self.task_embedding is not None and current_mode_idx is not None: - batch_size = text.size(0) - mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=text.device) + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) - task_embedding_lens 
= torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) - remaining_text_embedded = None - remaining_text_lens = None - if current_text_input_mode == 'full': - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded, text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, text_lens], - ) - elif current_text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:, :current_streaming_speech_delay, :] - prompt_text_lens = torch.ones_like(text_lens) * current_streaming_speech_delay - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - remaining_text_embedded = text_embedded[:, current_streaming_speech_delay:, :] - remaining_text_lens = text_lens - current_streaming_speech_delay - remaining_text_lens = remaining_text_lens.clamp(min=0) - remaining_text_mask = get_mask_from_lengths(remaining_text_lens) - remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + # Combine context embeddings: [task_embedding | context_audio | context_text] + if task_embedding is not 
None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], + ) else: - raise ValueError(f"Invalid text input mode: {current_text_input_mode}") + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens], + ) - return ContextTensors( - context_embedding=context_embedding, - context_lens=context_lens, - context_audio_codes=context_audio_codes, - context_audio_embedded=context_audio_embedded, - context_audio_codes_lens=context_audio_codes_lens, - text_embedded=text_embedded, - text_lens=text_lens, - context_text_tokens=context_text_tokens, - context_text_lens=context_text_lens, - remaining_text_embedded=remaining_text_embedded, - remaining_text_lens=remaining_text_lens, + # Handle CFG unconditional dropout + if dropout_conditional_input: + cfg_token_id = self.cfg_unk_token_id + cfg_token_embedding = self.decoder.get_input_embeddings()( + torch.full((batch_size, 1), cfg_token_id, device=device) + ) # (B, 1, E) + # Expand CFG token to match context embedding size + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) + + return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens + + def prepare_text_channel_embeddings( + self, + text: torch.Tensor, + text_lens: torch.Tensor, + delay: torch.Tensor, + dropout_text_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare text embeddings as a channel input with delay handling. + + This function embeds text tokens and prepends zero-padding based on the delay + parameter. The delay represents the number of zero positions to prepend before + the text embeddings, aligning the text channel with other channels. 
+ + Args: + text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + For text channel, this is typically just context_lens. + dropout_text_input: If True, return all zeros (for text dropout regularization). + + Returns: + Tuple of: + - text_channel_embedding: Text embeddings with zero-padded delay (B, T_delay + T_text, E) + - text_channel_lens: Total length of text channel for each batch item (B,) + """ + batch_size = text.size(0) + device = text.device + + # Embed text tokens + text_embedded = self.decoder.get_input_embeddings()(text) # (B, L, E) + + # Apply CAS encoding if using BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = get_mask_from_lengths(text_lens) + cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) + text_embedded = text_embedded + cas_embedding + + # Handle text dropout - zero out the embeddings + if dropout_text_input: + text_embedded = text_embedded * 0.0 + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with text embeddings + text_channel_embedding, text_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, text_embedded], + lengths=[delay, text_lens], + ) + + return text_channel_embedding, text_channel_lens + + def prepare_phoneme_channel_embeddings( + self, + phoneme_tokens: torch.Tensor, + phoneme_tokens_lens: torch.Tensor, + delay: torch.Tensor, + dropout_phoneme_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare phoneme embeddings as a channel input with delay handling. + + This function stacks phoneme tokens (if configured), embeds them, and prepends + zero-padding based on the delay parameter. 
The delay represents the number of + zero positions to prepend before the phoneme embeddings. + + Args: + phoneme_tokens: Phoneme token IDs (B, L) + phoneme_tokens_lens: Length of phoneme tokens for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + This is typically context_lens + phoneme_delay. + dropout_phoneme_input: If True, return all zeros (for phoneme dropout regularization). + + Returns: + Tuple of: + - phoneme_channel_embedding: Phoneme embeddings with zero-padded delay (B, T_delay + T_phoneme, E) + - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) + - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') + - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + """ + batch_size = phoneme_tokens.size(0) + device = phoneme_tokens.device + + # Stack phoneme tokens + phoneme_tokens_expanded = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens_stacked, phoneme_tokens_lens_stacked = self.stack_codes( + phoneme_tokens_expanded, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, ) + # Embed phoneme tokens + phoneme_embedded = self.embed_phoneme_tokens(phoneme_tokens_stacked) # (B, T', E) + + # Apply mask to zero out padding + phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens_stacked) + phoneme_embedded = phoneme_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + + # Handle phoneme dropout - zero out the embeddings + if dropout_phoneme_input: + phoneme_embedded = phoneme_embedded * 0.0 + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with phoneme embeddings + phoneme_channel_embedding, phoneme_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, phoneme_embedded], + lengths=[delay, 
phoneme_tokens_lens_stacked], + ) + + return phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked + + def prepare_audio_channel_embeddings( + self, + audio_codes: torch.Tensor, + audio_codes_lens: torch.Tensor, + delay: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare audio embeddings as a channel input with delay handling. + + This function processes audio codes by adding special tokens, stacking them, + and embedding them. It prepends zero-padding based on the delay parameter. + Also prepares input/target split for autoregressive training. + + Args: + audio_codes: Audio codes (B, C, T) - raw codes without special tokens + audio_codes_lens: Length of audio codes for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + In full mode: context_lens + text_lens + speech_delay + In streaming mode: context_lens + speech_delay + + Returns: + Tuple of: + - audio_channel_embedding: Audio embeddings with zero-padded delay (B, T_delay + T_audio, E) + - audio_channel_lens: Total length of audio channel for each batch item (B,) + - audio_codes_target: Target audio codes for loss computation (B, C, T'-1) + - audio_codes_lens_target: Length of target audio codes (B,) + """ + batch_size = audio_codes.size(0) + device = audio_codes.device + + # Apply codec conversion if configured + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() + + # Add BOS and EOS tokens + audio_codes, audio_codes_lens = self.add_special_tokens( + codes=audio_codes, + codes_len=audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + + # Stack audio codes across codebooks + audio_codes, audio_codes_lens = self.stack_codes( + audio_codes, + audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + 
self.num_audio_codebooks, + ) + + # Prepare input and target for autoregressive training + # Input: all tokens except the last (teacher forcing) + # Target: all tokens except the first (shifted by one) + audio_codes_lens_target = audio_codes_lens - 1 + audio_codes_target = audio_codes[:, :, 1:] # (B, C, T'-1) + audio_codes_input = audio_codes[:, :, :-1] # (B, C, T'-1) + + # Embed audio tokens + audio_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T'-1, E) + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with audio embeddings + audio_channel_embedding, audio_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, audio_embedded], + lengths=[delay, audio_codes_lens_target], + ) + + return audio_channel_embedding, audio_channel_lens, audio_codes_target, audio_codes_lens_target + def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): """ Slices the transformer output to get the predicted embeddings for the target sequence. @@ -1270,346 +1487,259 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): return x, orig_lens - def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): - """ - Prepare phoneme tokens as an auxiliary input channel for the decoder. - - This function processes phoneme tokens by stacking them (if configured), embedding them, - and prepending a zero-padded context region. The resulting tensor can be used as an - additional input channel to provide phoneme conditioning to the audio decoder. - - Args: - phoneme_tokens: Phoneme token IDs, shape (B, L) where B is batch size and - L is the phoneme sequence length. - phoneme_tokens_lens: Length of valid phoneme tokens for each batch item, shape (B,). - context_lens: Length of the context region for each batch item, shape (B,). 
- Used to prepend zero-padding to align with audio context. - - Returns: - Tuple of: - - phoneme_channel_input: Embedded phoneme tokens with zero-padded context, - shape (B, T_context + T_phoneme, E) where E is the embedding dimension. - - phoneme_channel_input_lens: Total length of phoneme channel input for each - batch item (context_lens + phoneme_tokens_lens after stacking), shape (B,). - - phoneme_tokens: Stacked phoneme tokens, shape (B, phoneme_stacking_factor, T_stacked). - - phoneme_tokens_lens: Length of stacked phoneme tokens, shape (B,). - """ - phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) - phoneme_tokens, phoneme_tokens_lens = self.stack_codes( - phoneme_tokens, - phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1, - ) - # import ipdb; ipdb.set_trace() - phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) - - phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) - phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) - - zero_context_tensor = torch.zeros( - context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device - ) - phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( - embeddings=[zero_context_tensor, phoneme_tokens_embedded], - lengths=[context_lens, phoneme_tokens_lens], - ) - return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch( self, text: torch.Tensor, text_lens: torch.Tensor, context_text_tokens: torch.Tensor, context_text_tokens_lens: torch.Tensor, - audio: Optional[torch.Tensor] = None, - audio_lens: Optional[torch.Tensor] = None, - audio_codes: Optional[torch.Tensor] = None, - audio_codes_lens: Optional[torch.Tensor] = None, - context_audio: Optional[torch.Tensor] = None, - context_audio_lens: Optional[torch.Tensor] = None, - 
context_audio_codes: Optional[torch.Tensor] = None, - context_audio_codes_lens: Optional[torch.Tensor] = None, + audio_codes: torch.Tensor, + audio_codes_lens: torch.Tensor, + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, phoneme_tokens: Optional[torch.Tensor] = None, phoneme_tokens_lens: Optional[torch.Tensor] = None, mode: str = "train", training_mode: Optional[TrainingMode] = None, ) -> ProcessBatchOutput: """ - Process a batch of inputs to compute model outputs and losses. + Simplified batch processing using channel-based embedding architecture. + + This function provides a cleaner implementation of process_batch where: + 1. Context is prepared separately (without text) + 2. Text, phoneme, and audio are each treated as channels with delay-based alignment + 3. Channels are summed element-wise and joined temporally with context - This function performs the following steps: - 1. Prepares context tensors from text and audio inputs - 2. Optionally applies dropout to text/phoneme inputs for regularization - 3. Optionally applies classifier-free guidance (CFG) unconditional training - 4. Converts audio to codes if not already provided - 5. Embeds audio codes and combines with context embeddings - 6. Runs the transformer forward pass - 7. 
Computes codebook loss, phoneme loss (if applicable), and local transformer loss (if applicable) + The delay handling ensures proper temporal alignment: + - Text channel delay: context_lens (no additional delay) + - Phoneme channel delay: context_lens + phoneme_delay + - Audio channel delay: context_lens + text_lens + speech_delay (full mode) + or context_lens + speech_delay (streaming mode) Args: - text: Input text token IDs, shape (B, L) - text_lens: Length of text for each batch item, shape (B,) - context_text_tokens: Context text token IDs for conditioning, shape (B, L_ctx) - context_text_tokens_lens: Length of context text for each batch item, shape (B,) - audio: Raw audio waveform (used if audio_codes not provided), shape (B, T_audio) - audio_lens: Length of audio for each batch item, shape (B,) - audio_codes: Pre-computed audio codes (optional, computed from audio if not provided), shape (B, C, T) - audio_codes_lens: Length of audio codes for each batch item, shape (B,) - context_audio: Raw context audio waveform (optional), shape (B, T_ctx_audio) - context_audio_lens: Length of context audio for each batch item, shape (B,) - context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) - context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) - phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) - mode: Training mode, either "train" or "val". Affects dropout behavior. - training_mode: Optional TrainingMode object specifying which mode to use. - If None and multi_mode_training is enabled, a random mode is selected during training. 
+ text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs for conditioning (B, L_ctx) + context_text_tokens_lens: Length of context text (B,) + audio_codes: Audio codes (B, C, T) - raw codes without special tokens + audio_codes_lens: Length of audio codes (B,) + context_audio_codes: Pre-computed context audio codes (B, C, T') + context_audio_codes_lens: Length of context audio codes (B,) + phoneme_tokens: Phoneme token IDs (optional) (B, L_phoneme) + phoneme_tokens_lens: Length of phoneme tokens (B,) + mode: Training mode, either "train" or "val" + training_mode: Optional TrainingMode object Returns: - ProcessBatchOutput: Dataclass containing: - - loss: Total combined loss - - codebook_loss: Loss for audio codebook prediction - - phoneme_loss: Loss for phoneme prediction (None if not using phonemes) - - local_transformer_loss: Loss from local transformer (None if not used) - - local_transformer_logits: Logits from local transformer - - logits: Predicted logits from the main decoder - - audio_codes_target: Target audio codes - - audio_codes_lens_target: Length of target audio codes - - context_audio_codes: Audio codes from context - - context_audio_codes_lens: Length of context audio codes + ProcessBatchOutput: Contains loss values and model predictions """ - # Select training mode for multi-mode training - # During training, randomly select a mode if not specified - # During validation, use the first mode (default) if not specified + # Select training mode selected_training_mode = training_mode if selected_training_mode is None: if mode == 'train': - # Randomly select a mode during training selected_training_mode = random.choice(self.training_modes) else: - # Use the first mode during validation selected_training_mode = self.training_modes[0] - # Get the current mode's parameters current_text_input_mode = selected_training_mode.text_input_mode current_streaming_speech_delay = 
selected_training_mode.streaming_speech_delay current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay - # Determine whether to apply text/phoneme dropout for regularization during training - # Text dropout: randomly drop text input to encourage the model to rely on other signals + # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False if dropout_phoneme_input and dropout_text_input: - # Only one of the two can be True, so choose randomly dropout_phoneme_input = random.random() < 0.5 dropout_text_input = not dropout_phoneme_input - # Prepare context tensors by combining text and audio context information - context_tensors = self.prepare_context_tensors( - text=text, - text_lens=text_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_audio=context_audio, - context_audio_lens=context_audio_lens, - dropout_text_input=dropout_text_input, - training_mode=selected_training_mode, - ) - - # Extract context tensors for use in the forward pass - remaining_text_embedded = context_tensors.remaining_text_embedded - context_embedding = context_tensors.context_embedding - context_lens = context_tensors.context_lens - - # Classifier-Free Guidance (CFG) unconditional training: - # With some probability, replace the context with a special unconditional token - # This allows the model to generate without conditioning during inference + # Determine CFG unconditional dropout dropout_conditional_input = False if mode == 'train' and self.cfg_unconditional_prob > 0.0: if torch.rand(1).item() < self.cfg_unconditional_prob: dropout_conditional_input = True - # Get embedding of a special UNCONDITIONAL_TOKEN - cfg_token_id = 
self.cfg_unk_token_id # int - cfg_token_embedding = self.decoder.get_input_embeddings()( - torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device) - ) # (B, 1, E) - # Keeping the dummy context same size as the context embedding makes - # inference easier especially with KV caching and using a duplicated batch. - context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) - # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. - if current_text_input_mode == 'streaming': - remaining_text_embedded = torch.zeros_like(remaining_text_embedded) - - # Convert raw audio to discrete codes if codes are not already provided - if audio_codes is None: - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) - # Apply codec conversion if a converter is configured (e.g., for different codec formats) - if self._codec_converter is not None: - audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=audio_codes, audio_lens=audio_codes_lens - ).long() + # 1. Prepare context tensors (without text) + context_embedding, context_lens, context_audio_codes_processed, context_audio_codes_lens_processed = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=dropout_conditional_input, + ) + ) - # Add BOS (beginning of sequence) and EOS (end of sequence) tokens to audio codes - audio_codes, audio_codes_lens = self.add_special_tokens( - codes=audio_codes, - codes_len=audio_codes_lens, - bos_id=self.audio_bos_id, - eos_id=self.audio_eos_id, + # 2. 
Compute delays for each channel based on mode + # Text channel delay: always context_lens + text_delay = context_lens.clone() + + # Phoneme channel delay: context_lens + phoneme_delay (both modes) + phoneme_delay = context_lens + current_streaming_phonemes_delay + + # Audio channel delay depends on mode + if current_text_input_mode == 'full': + # Full mode: context_lens + text_lens + speech_delay + audio_delay = context_lens + text_lens + current_streaming_speech_delay + else: + # Streaming mode: context_lens + speech_delay + audio_delay = context_lens + current_streaming_speech_delay + + # 3. Prepare text channel embeddings + text_channel_embedding, text_channel_lens = self.prepare_text_channel_embeddings( + text=text, + text_lens=text_lens, + delay=text_delay, + dropout_text_input=dropout_text_input or dropout_conditional_input, ) - # Stack audio codes across codebooks for multi-codebook processing - # This reshapes codes for parallel prediction of multiple codebooks - audio_codes, audio_codes_lens = self.stack_codes( - audio_codes, - audio_codes_lens, - self.audio_bos_id, - self.audio_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, + # 4. Prepare phoneme channel embeddings (if phoneme tokenizer is configured) + phoneme_channel_embedding = None + phoneme_tokens_stacked = None + phoneme_tokens_lens_stacked = None + if self.phoneme_tokenizer is not None and phoneme_tokens is not None: + ( + phoneme_channel_embedding, + phoneme_channel_lens, + phoneme_tokens_stacked, + phoneme_tokens_lens_stacked, + ) = self.prepare_phoneme_channel_embeddings( + phoneme_tokens=phoneme_tokens, + phoneme_tokens_lens=phoneme_tokens_lens, + delay=phoneme_delay, + dropout_phoneme_input=dropout_phoneme_input or dropout_conditional_input, + ) + + # 5. 
Prepare audio channel embeddings + ( + audio_channel_embedding, + audio_channel_lens, + audio_codes_target, + audio_codes_lens_target, + ) = self.prepare_audio_channel_embeddings( + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + delay=audio_delay, ) - # Prepare input and target sequences for autoregressive training - # Input: all tokens except the last (teacher forcing) - # Target: all tokens except the first (shifted by one position) - audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 - audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder - audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder - - # Embed audio tokens to get continuous representations - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) - - # In streaming mode, add remaining text embeddings to audio embeddings - # This provides text information at each audio timestep - if remaining_text_embedded is not None: - # Pad remaining text to match audio sequence length by adding zeros on the right - padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - if padding_len > 0: - padding_tensor = torch.zeros( - remaining_text_embedded.size(0), - padding_len, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) - else: - # Log Warning - print( - f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}" - ) - remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] - # Add text information to audio embeddings (element-wise addition) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded - - # Concatenate context embeddings with audio embeddings along the time dimension - # Result: [context_embedding 
| audio_codes_input_embedded] - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( - embeddings=[context_embedding, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens_input], + # 6. Sum the channel embeddings element-wise + # First, align all channels to the same length (max of all channel lengths) + max_channel_len = max( + text_channel_embedding.size(1), + audio_channel_embedding.size(1), + phoneme_channel_embedding.size(1) if phoneme_channel_embedding is not None else 0, ) - # Process phoneme input if phoneme tokenizer is configured - if self.phoneme_tokenizer is not None: - # Compute context length offset for phoneme alignment - # This accounts for different delays in speech vs phoneme streams - # Use the selected mode's streaming delays - context_lens_for_phonemes = ( - context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + # Pad text channel if needed + if text_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + text_channel_embedding.size(0), + max_channel_len - text_channel_embedding.size(1), + text_channel_embedding.size(2), + device=text_channel_embedding.device, ) - - # Prepare phoneme channel input with proper alignment - ( - phoneme_channel_input, - phoneme_channel_input_lens, - phoneme_tokens_processed, - phoneme_tokens_lens_processed, - ) = self.prepare_phoneme_channel_input(phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes) - - # Align phoneme channel input to match the combined context+audio sequence length - if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: - # Pad phoneme channel with zeros if shorter than context+audio - padding_tensor = torch.zeros( - phoneme_channel_input.shape[0], - context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], - phoneme_channel_input.shape[2], - device=phoneme_channel_input.device, + text_channel_embedding = torch.cat([text_channel_embedding, padding], dim=1) + 
+ # Pad audio channel if needed + if audio_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + audio_channel_embedding.size(0), + max_channel_len - audio_channel_embedding.size(1), + audio_channel_embedding.size(2), + device=audio_channel_embedding.device, + ) + audio_channel_embedding = torch.cat([audio_channel_embedding, padding], dim=1) + + # Sum channels + combined_channel_embedding = text_channel_embedding + audio_channel_embedding + + # Add phoneme channel if available + if phoneme_channel_embedding is not None: + if phoneme_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + phoneme_channel_embedding.size(0), + max_channel_len - phoneme_channel_embedding.size(1), + phoneme_channel_embedding.size(2), + device=phoneme_channel_embedding.device, ) - phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) - else: - # Truncate phoneme channel if longer than context+audio - phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] + phoneme_channel_embedding = torch.cat([phoneme_channel_embedding, padding], dim=1) + combined_channel_embedding = combined_channel_embedding + phoneme_channel_embedding + + # 7. 
Join context with combined channel embeddings + # The combined_channel_lens is the max of all channel lens for each batch item + combined_channel_lens = torch.stack([ + text_channel_lens, + audio_channel_lens, + phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, + ], dim=0).max(dim=0).values + + + + # Right pad context embedding + context_padding = torch.zeros( + context_embedding.size(0), + combined_channel_embedding.size(1) - context_embedding.size(1), + context_embedding.size(2), + device=context_embedding.device, + ) + context_embedding_padded = torch.cat([context_embedding, context_padding], dim=1) - # Add phoneme information unless doing unconditional or phoneme dropout training - if (not dropout_conditional_input) and (not dropout_phoneme_input): - context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + full_embedding = context_embedding_padded + combined_channel_embedding - # Run the transformer forward pass + # 8. Forward pass through transformer transformer_out = self.forward( - inputs_embeds=context_plus_audio_embedded, - attention_mask=get_mask_from_lengths(context_plus_audio_lens), + inputs_embeds=full_embedding, + attention_mask=get_mask_from_lengths(combined_channel_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) - # Extract prediction embeddings by slicing out the audio portion (excluding context) + # 9. 
Extract prediction embeddings and compute losses + # Audio predictions start at audio_delay pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, - context_lens=context_lens, + context_lens=audio_delay, target_lens=audio_codes_lens_target, ) - # Project embeddings to logits for each codebook - # First project from hidden_dim to audio_embedding_dim, then to logits + # Project to audio logits pred_embeddings_audio = self.audio_out_projection(pred_embeddings) - logits = self.final_proj(pred_embeddings_audio) # (B, T', num_codebooks * num_tokens_per_codebook) + logits = self.final_proj(pred_embeddings_audio) - # Compute the main codebook prediction loss + # Compute codebook loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss - # Compute local transformer loss if using local transformer architecture + # Compute local transformer loss if applicable local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - # Compute logits using the local (autoregressive) transformer local_transformer_logits = self.compute_local_transformer_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) - # Scale and add local transformer loss to total loss local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss - # Compute phoneme prediction loss if using phoneme tokenizer + # Compute phoneme loss if applicable phoneme_loss = None - if self.phoneme_tokenizer is not None: - # Extract phoneme prediction embeddings with proper alignment + if self.phoneme_tokenizer is not None and phoneme_tokens_stacked is not None: + # Phoneme 
predictions start at phoneme_delay pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, - context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens_processed - 1, + context_lens=phoneme_delay, + target_lens=phoneme_tokens_lens_stacked - 1, ) - # Project to phoneme logits - phoneme_logits = self.phoneme_final_proj( - pred_embeddings_phoneme - ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) - # Only compute phoneme loss if not doing any dropout - # (unconditional, text dropout, or phoneme dropout) if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens_processed[:, :, 1:].long(), phoneme_tokens_lens_processed - 1 + phoneme_logits, phoneme_tokens_stacked[:, :, 1:].long(), phoneme_tokens_lens_stacked - 1 ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: - # Skip phoneme loss computation during dropout training phoneme_loss = torch.tensor(0.0, device=logits.device) print("Dropout - phoneme loss skipped", phoneme_loss.item()) @@ -1624,27 +1754,37 @@ def process_batch( logits=logits, audio_codes_target=audio_codes_target, audio_codes_lens_target=audio_codes_lens_target, - context_audio_codes=context_tensors.context_audio_codes, - context_audio_codes_lens=context_tensors.context_audio_codes_lens, + context_audio_codes=context_audio_codes_processed, + context_audio_codes_lens=context_audio_codes_lens_processed, selected_training_mode=selected_training_mode.name if selected_training_mode is not None else None, ) def training_step(self, batch, batch_idx): - # Extract inputs from batch and pass explicitly to process_batch - # import ipdb; ipdb.set_trace() + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = 
batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if 'audio_codes' in batch: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + else: + audio = batch['audio'] + audio_lens = batch['audio_lens'] + audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], context_text_tokens=batch['context_text_tokens'], context_text_tokens_lens=batch['context_text_tokens_lens'], - audio=batch.get('audio'), - audio_lens=batch.get('audio_lens'), - audio_codes=batch.get('audio_codes'), - audio_codes_lens=batch.get('audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, phoneme_tokens=batch.get('phoneme_tokens'), phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), mode="train", @@ -1709,19 +1849,31 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if 'audio_codes' in batch: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + else: + audio = batch['audio'] + audio_lens = batch['audio_lens'] + 
audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], context_text_tokens=batch['context_text_tokens'], context_text_tokens_lens=batch['context_text_tokens_lens'], - audio=batch.get('audio'), - audio_lens=batch.get('audio_lens'), - audio_codes=batch.get('audio_codes'), - audio_codes_lens=batch.get('audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, phoneme_tokens=batch.get('phoneme_tokens'), phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), mode="val", @@ -1897,47 +2049,7 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def _log_phoneme_predictions( - self, - pred_phoneme_token_lists: List[List[int]], - gt_phoneme_token_lists: List[List[int]], - batch_size: int, - ) -> None: - """Log predicted vs ground truth phoneme tokens for debugging.""" - for item_idx in range(batch_size): - logging.info(f"Predicted phoneme tokens for item {item_idx}: {pred_phoneme_token_lists[item_idx]}") - logging.info(f"GT phoneme tokens for item {item_idx}: {gt_phoneme_token_lists[item_idx]}") - predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) - gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) - logging.info(f"Predicted phoneme text for item {item_idx}: {predicted_phoneme_text}") - logging.info(f"GT phoneme text for item {item_idx}: {gt_phoneme_text}") - - def _collect_phoneme_tokens_for_logging( - self, - pred_phoneme_tokens: torch.Tensor, - gt_phoneme_tokens_current: torch.Tensor, - 
use_phoneme_input: torch.Tensor, - pred_phoneme_token_lists: List[List[int]], - gt_phoneme_token_lists: List[List[int]], - batch_size: int, - ) -> None: - """Collect phoneme tokens into lists for later logging (does not print).""" - special_tokens = { - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - } - for item_idx in range(batch_size): - if use_phoneme_input[item_idx, 0, 0] > 0: - for phoneme_channel_idx in range(self.phoneme_stacking_factor): - pred_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if pred_token not in special_tokens: - pred_phoneme_token_lists[item_idx].append(pred_token) - - gt_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if gt_token not in special_tokens: - gt_phoneme_token_lists[item_idx].append(gt_token) - + def _sample_audio_codes( self, last_hidden: torch.Tensor, @@ -1978,235 +2090,67 @@ def _sample_audio_codes( return audio_codes_next, all_codes_next_argmax - def _process_phoneme_predictions( - self, - last_hidden: torch.Tensor, - actual_batch_size: int, - current_phoneme_positions: torch.Tensor, - gt_phoneme_tokens: torch.Tensor, - phoneme_input_type: str, - phoneme_sampling_method: str, - temperature: float, - topk: int, - timestep_idx: int, - device: torch.device, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Process phoneme predictions for the current timestep. 
- - Returns: - pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) - gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) - input_phoneme_tokens_current: Tokens to use as input (GT or predicted) - input_phoneme_embedding: Embedded phoneme tokens (B, phoneme_stacking_factor, E) - """ - # Get phoneme logits and sample - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=temperature, topk=topk - ) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) - - # Select predicted tokens based on sampling method - pred_phoneme_tokens = ( - all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme - ) - - # Handle BOS token at position 0 - phoneme_bos_tensor = torch.full( - (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=device, - ).long() - use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - pred_phoneme_tokens = ( - use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens - ).long() - - # Get ground truth phoneme tokens for current timestep - gt_phoneme_idx = min(timestep_idx, gt_phoneme_tokens.size(2) - 1) - gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] - - # Select input tokens (GT or predicted) and embed - input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) - - return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding - - def _compute_phoneme_channel_input( - self, - 
input_phoneme_embedding: torch.Tensor, - current_phoneme_positions: torch.Tensor, - phoneme_stream_ended: torch.Tensor, - actual_batch_size: int, - device: torch.device, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Compute the phoneme channel input embedding with masking. - - Returns: - phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) - use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) - """ - # Determine which items should use phoneme input - use_phoneme_input = (current_phoneme_positions >= 0) & (~phoneme_stream_ended) - use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() - - # Create zero embedding for items not using phoneme input - zero_phoneme_embedding = torch.zeros(actual_batch_size, 1, self.cfg.embedding_dim, device=device) - - # Combine: use phoneme embedding where active, zero otherwise - phoneme_channel_input_t = ( - use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding - ) - - return phoneme_channel_input_t, use_phoneme_input - - def _prepare_next_decoder_input( + def streaming_init( self, - audio_codes_next: torch.Tensor, - context_plus_audio_embedded: torch.Tensor, - context_plus_audio_lens: torch.Tensor, - min_context_len: int, - idx: int, - current_text_input_mode: str, - remaining_text_embedded: Optional[torch.Tensor], - current_text_positions: torch.Tensor, - phoneme_channel_input_t: Optional[torch.Tensor], - use_cfg: bool, - dummy_context_embedding_unconditional: Optional[torch.Tensor], - ) -> torch.Tensor: - """ - Prepare the input embedding for the next decoder step. 
- - Handles: - - Mixing context embeddings with generated audio embeddings based on context completeness - - Adding streaming text embeddings if in streaming mode - - Adding phoneme channel input if available - - Duplicating for CFG if enabled + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + phoneme_input_type: str = 'predicted', + phoneme_sampling_method: str = 'argmax', + gt_phoneme_tokens: Optional[torch.Tensor] = None, + gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + ) -> StreamingState: """ - batch_size = audio_codes_next.size(0) - device = audio_codes_next.device - - # Embed the newly generated audio codes - new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) - new_emb_unconditional = new_emb.clone() - - # Add streaming text embeddings if in streaming mode - if current_text_input_mode == 'streaming': - remaining_text_idx = current_text_positions.clamp(min=0) - remaining_text_embedded_current = remaining_text_embedded[ - torch.arange(batch_size, device=device), remaining_text_idx, : - ].unsqueeze(1) - new_emb = new_emb + remaining_text_embedded_current - - # Check which items still have context to process - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len - - if context_incomplete_mask.any(): - # Some items still processing context - blend context with generated embeddings - context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() - context_embedding_slice = context_plus_audio_embedded[ - :, min_context_len + idx : min_context_len + idx + 1, : - ] - next_input = context_incomplete_mask * context_embedding_slice + (1 - context_incomplete_mask) * new_emb - - if phoneme_channel_input_t is not 
None: - next_input = next_input + phoneme_channel_input_t + Initialize streaming TTS inference state. - if use_cfg: - next_input_unconditional = ( - context_incomplete_mask * dummy_context_embedding_unconditional - + (1 - context_incomplete_mask) * new_emb_unconditional - ) - next_input = torch.cat([next_input, next_input_unconditional], dim=0) - else: - # All items finished context - use generated embeddings - next_input = new_emb - if phoneme_channel_input_t is not None: - next_input = next_input + phoneme_channel_input_t + This prepares the model for streaming inference by processing the context + (audio + context text) and returning a StreamingState that can be used + with streaming_step() to incrementally generate audio. - if use_cfg: - next_input = torch.cat([next_input, new_emb_unconditional], dim=0) - - return next_input + Note: This function does NOT take the main text input. Text tokens are + provided incrementally via streaming_step(). - def _check_eos_and_update_end_indices( - self, - all_codes_next_argmax: torch.Tensor, - audio_codes_next: torch.Tensor, - end_indices: Dict[int, int], - context_plus_audio_lens: torch.Tensor, - min_context_len: int, - idx: int, - verbose: bool = False, - ) -> None: - """Check for EOS tokens and update end indices for completed items.""" - for item_idx in range(all_codes_next_argmax.size(0)): - # Only check items that haven't ended and have passed their context - if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: - pred_tokens = all_codes_next_argmax[item_idx] - pred_tokens_multinomial = audio_codes_next[item_idx] - - if torch.any(pred_tokens == self.audio_eos_id) or torch.any( - pred_tokens_multinomial == self.audio_eos_id - ): - if verbose: - logging.info(f"EOS detected for item {item_idx} at timestep {idx}") - end_indices[item_idx] = idx + For batched inference, each batch item can have a different context length. 
+ This function processes only up to the minimum context length across the batch, + storing the remaining context to be processed in streaming_step's context phase. - def infer_batch( - self, - batch, - max_decoder_steps=500, - temperature=0.7, - topk=80, - use_local_transformer_for_inference=False, - maskgit_n_steps=3, - use_cfg=False, - cfg_scale=1.0, - phoneme_input_type='gt', - phoneme_sampling_method='argmax', - dropout_text_input=False, - inference_mode: Optional[str] = None, - verbose: bool = False, - ): - """ - Run inference on a batch of inputs to generate audio from text. + The streaming inference follows phases (per batch item): + 1. Context phase: Processing remaining context (if any) for items with longer context. + 2. Prompt phase: First `streaming_speech_delay` text tokens are processed + without generating audio (building up context). + 3. Generation phase: Audio BOS is added and audio codes are generated + autoregressively, with remaining text tokens added to audio embeddings. Args: - batch: Input batch containing: - - text, text_lens: Input text tokens and lengths - - context_text_tokens, context_text_tokens_lens: Context text for speaker/style - - context_audio_codes/context_audio (optional): Audio context for speaker cloning - max_decoder_steps: Maximum number of decoding steps. - temperature: Sampling temperature for audio codes. - topk: Top-k sampling parameter. - use_local_transformer_for_inference: Whether to use local transformer for AR sampling. - maskgit_n_steps: Number of MaskGit steps (unused in AR mode). + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + context_audio_codes_lens: Length of context audio codes (B,). + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). + context_text_tokens_lens: Length of context text (B,). + inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). + If None, uses the default inference mode. 
use_cfg: Whether to use classifier-free guidance. cfg_scale: CFG scale factor (higher = stronger conditioning). - phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. + use_local_transformer: Whether to use local transformer for AR sampling. + temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - dropout_text_input: Whether to dropout text input for CFG training. - inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). - If None, uses the default inference mode. - verbose: If True, enables detailed logging of decoding progress, EOS detection, - and phoneme predictions. Default False for cleaner output. + gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. + gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). Returns: - predicted_audio: Generated audio waveforms (B, max_audio_len) - predicted_audio_lens: Lengths of generated audio (B,) - predicted_codes: Generated audio codes (B, num_codebooks, T) - predicted_codes_lens: Lengths of generated codes (B,) - rtf_metrics: Dictionary with timing metrics (rtf, time_to_first_prediction, etc.) + StreamingState: Initial state for streaming inference. """ with torch.inference_mode(): - start_time = time.time() + batch_size = context_audio_codes.size(0) + device = context_audio_codes.device # Resolve inference mode mode_name = inference_mode if inference_mode is not None else self.default_inference_mode @@ -2215,306 +2159,775 @@ def infer_batch( raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") selected_training_mode = self.mode_name_to_mode[mode_name] - if verbose: - logging.info(f"Using inference mode: {selected_training_mode.name}") - - current_text_input_mode = selected_training_mode.text_input_mode - current_streaming_speech_delay = selected_training_mode.streaming_speech_delay - current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay - - # Prepare context embeddings (text + audio context) - context_tensors = self.prepare_context_tensors( - text=batch['text'], - text_lens=batch['text_lens'], - context_text_tokens=batch['context_text_tokens'], - context_text_tokens_lens=batch['context_text_tokens_lens'], - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - dropout_text_input=dropout_text_input, + + # Prepare context embedding using shared helper + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, training_mode=selected_training_mode, + dropout_conditional_input=False, ) - context_embedding = context_tensors.context_embedding # (B, T_total, E) - context_lens = context_tensors.context_lens # (B,) - remaining_text_embedded = context_tensors.remaining_text_embedded - remaining_text_lens = context_tensors.remaining_text_lens - - actual_batch_size = context_embedding.size(0) - device = context_embedding.device - - # Prepare phoneme channel input if phoneme tokenizer is available - gt_phoneme_tokens = None - if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = ( - context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay - ) - _, _, 
gt_phoneme_tokens, _ = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes - ) - - # Initialize audio codes with BOS token - audio_codes_bos = torch.full( - (actual_batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_codes_lens = torch.ones(actual_batch_size, device=device).long() - - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_bos) # (B, 1, E) - # For streaming mode, add text embeddings to audio BOS - if current_text_input_mode == 'streaming': - remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 - remaining_text_pad_tensor = torch.zeros( - actual_batch_size, remaining_text_pad_length, remaining_text_embedded.size(2), device=device - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + # Store full context embedding and lens before any CFG manipulation + full_context_embedding = context_embedding.clone() # (B, T_max, E) + full_context_lens = context_lens.clone() # (B,) - # Combine context and audio embeddings - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( - embeddings=[context_embedding, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens], - ) - min_context_len = context_plus_audio_lens.min().item() - - # Adjust min_context_len for phoneme delay if using phoneme tokenizer - if self.phoneme_tokenizer is not None: - min_context_len = ( - min_context_len - current_streaming_speech_delay + current_streaming_phonemes_delay - 1 - ) + # Compute min context length - we only process up to this in init + min_context_len = context_lens.min().item() # Setup classifier-free guidance if enabled dummy_context_embedding_unconditional = None if use_cfg: - # Create unconditional 
context embedding (all UNK tokens) dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=device) + torch.full((1, 1), self.cfg_unk_token_id, device=device) ) - dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( - -1, context_embedding.size(1), -1 + # Create unconditional context (same length as conditional) + dummy_context_expanded = dummy_context_embedding_unconditional.expand( + batch_size, context_embedding.size(1), -1 ) + # Concatenate conditional and unconditional: (2*B, T, E) + context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) - dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( - embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens], - ) - # Concatenate conditional and unconditional inputs: (2B, T_min, E) - first_inference_input = torch.cat( - [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 - )[:, :min_context_len, :] - else: - first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] - - # First forward pass to process all context at once + # First forward pass to process context - only up to min_context_len cache_position = torch.arange(min_context_len, device=device) transformer_out = self.forward( - inputs_embeds=first_inference_input, + inputs_embeds=context_embedding[:, :min_context_len, :], attention_mask=None, use_cache=True, past_key_values=None, cache_position=cache_position, ) - time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values current_cache_seq_len = min_context_len - # Initialize decoding state - all_predictions = [] - end_indices = {} # Maps item_idx -> timestep when EOS was detected - - # Track text position for each item in batch - # Negative values 
indicate we haven't started reading remaining text yet - current_text_positions = torch.tensor( - [min_context_len - context_plus_audio_lens[i] for i in range(actual_batch_size)], + # Process GT phoneme tokens if provided (for teacher forcing) + gt_phoneme_embeddings = None + gt_phoneme_lens = None + if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: + gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) + gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( + gt_phoneme_expanded, + gt_phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, + ) + gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + + # Initialize streaming state with batch support + state = StreamingState( + batch_size=batch_size, + past_key_values=past_kv, + cache_seq_len=current_cache_seq_len, + all_predictions=[], + all_phoneme_predictions=[], + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_lens=context_lens, + full_context_embedding=full_context_embedding, + full_context_lens=full_context_lens, + context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), + text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + finished=torch.zeros(batch_size, dtype=torch.bool, device=device), device=device, - ).long() + training_mode=selected_training_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + last_hidden=last_hidden, + text_finished=torch.zeros(batch_size, 
dtype=torch.bool, device=device), + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + last_phoneme_tokens=None, + last_audio_codes=None, + audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + gt_phoneme_embeddings=gt_phoneme_embeddings, + gt_phoneme_lens=gt_phoneme_lens, + ) - # Initialize phoneme tracking state - current_phoneme_positions = None - pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - phoneme_stream_ended = torch.zeros(actual_batch_size, device=device).bool() + return state - if self.phoneme_tokenizer is not None: - current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 - - # Main autoregressive decoding loop - for idx in range(max_decoder_steps): - # Update position trackers - current_text_positions += 1 - if self.phoneme_tokenizer is not None: - current_phoneme_positions += 1 - - if verbose and idx % 20 == 0: - logging.info(f"Decoding timestep {idx}") - - # Compute audio logits from last hidden state - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) - - # Apply CFG to logits if enabled - if use_cfg: - conditional_logits = all_code_logits_t[:actual_batch_size] - unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - - # Sample audio codes - audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( - last_hidden=last_hidden, - all_code_logits_t=all_code_logits_t, - temperature=temperature, - 
topk=topk, - use_local_transformer_for_inference=use_local_transformer_for_inference, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) + def streaming_step( + self, + state: StreamingState, + text_tokens: Optional[torch.Tensor] = None, + force_dropout_text: bool = False, + ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform one streaming inference step with batch support. + + This function processes one text token per batch item (or signals end of text with None) + and generates predictions according to the streaming delays. Each batch item can be + in a different phase. + + The streaming operates in four phases per batch item: + 1. Context phase (context_position < full_context_lens): + - Still processing remaining context from streaming_init + - Uses context embedding, ignores text_tokens for this item + 2. Prompt phase (text_tokens_seen < phoneme_delay): + - Only text tokens are processed, KV cache is extended + - No phoneme or audio predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): + - Starts with phoneme BOS on first step + - Only phoneme predictions (no audio) + - Input: text embedding + phoneme embedding + 4. Audio phase (text_tokens_seen >= speech_delay): + - Starts with audio BOS on first step + - Both phoneme and audio predictions + - Input: text embedding + phoneme embedding + audio embedding + + IMPORTANT: Only ONE forward call to the decoder per streaming_step. 
- # Process phoneme predictions if phoneme tokenizer exists - phoneme_channel_input_t = None - if self.phoneme_tokenizer is not None: - ( - pred_phoneme_tokens, - gt_phoneme_tokens_current, - input_phoneme_tokens_current, - input_phoneme_embedding, - ) = self._process_phoneme_predictions( - last_hidden=last_hidden, - actual_batch_size=actual_batch_size, - current_phoneme_positions=current_phoneme_positions, - gt_phoneme_tokens=gt_phoneme_tokens, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - temperature=temperature, - topk=topk, - timestep_idx=idx, + Args: + state: Current StreamingState from streaming_init or previous streaming_step. + text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. + For items still in context phase, the text_token value is ignored (can be 0). + When None is passed, the model continues generating until EOS. + + Returns: + Tuple of: + - Updated StreamingState + - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase + where C = num_audio_codebooks and S = frame_stacking_factor + - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase + """ + if state.finished.all(): + return state, None, None + + with torch.inference_mode(): + device = state.device + batch_size = state.batch_size + streaming_speech_delay = state.training_mode.streaming_speech_delay + streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay + + # ==================== DETERMINE PHASES PER BATCH ITEM ==================== + needs_context = state.context_position < state.full_context_lens # (B,) bool + needs_text = (~needs_context) & (~state.text_finished) + needs_phoneme = (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_audio = (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + + next_input = torch.zeros(batch_size, 1, 
self.cfg.embedding_dim, device=device) + # --- Context phase items: use next context embedding --- + if needs_context.any(): + # Gather context embeddings at current position for each item + # context_position: (B,) - position indices + # full_context_embedding: (B, T_max, E) + ctx_positions = state.context_position.clone() # (B,) + # Clamp positions to valid range for gathering + ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) + # Gather: need (B, 1, E) from (B, T, E) at positions (B,) + ctx_emb = state.full_context_embedding[ + torch.arange(batch_size, device=device), + ctx_positions, + : + ].unsqueeze(1) # (B, 1, E) + # Only apply to items in context phase + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input = next_input + ctx_emb * context_mask + + # --- Non-context phase items: handle text embedding --- + text_embedded = None + if text_tokens is not None and needs_text.any(): + # Embed text tokens for all items (will be masked later) + text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) + text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) + + # Handle BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) + cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) + text_embedded = text_embedded + cas_embedding + + if force_dropout_text: + text_embedded = text_embedded * 0 + + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask + # Check for EOS tokens - mark those items as text_finished + # Items that receive EOS should not have their text embedded added after this step + is_eos_token = (text_tokens == self.eos_id) # (B,) bool + state.text_finished = state.text_finished | is_eos_token + + elif text_tokens is None: + # Text finished signal for items not in context phase + state.text_finished = state.text_finished | 
~needs_context + + # --- Phoneme embedding for phoneme and audio phase items --- + if self.phoneme_tokenizer is not None: + if needs_phoneme.any(): + phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: + # Teacher forcing: use pre-computed GT phoneme embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) + positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) + gt_emb = state.gt_phoneme_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze(1) # (B, 1, E) + phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + gt_emb * phoneme_mask + else: + # Prediction mode: use BOS or last predicted phoneme + first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) + has_last_phoneme = needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + + if first_phoneme_step.any(): + phoneme_bos = torch.full( + (batch_size, self.phoneme_stacking_factor, 1), + self.phoneme_tokenizer.bos_token_id, + device=device, + ).long() + phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) + first_mask = first_phoneme_step.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask + + if has_last_phoneme.any() and state.last_phoneme_tokens is not None: + last_phoneme_emb = self.embed_phoneme_tokens(state.last_phoneme_tokens.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_phoneme.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + next_input = next_input + phoneme_emb + + # --- Audio embedding for audio phase items --- + if needs_audio.any(): + # Determine which items are at first audio step + first_audio_step = needs_audio & (state.audio_steps == 0) + has_last_audio = 
needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask + + next_input = next_input + audio_emb + + # ==================== HANDLE CFG ==================== + if state.use_cfg: + # For unconditional branch, use dummy embedding for non-audio items + # and audio-only embedding for audio items + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand(batch_size, 1, -1) + # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts + next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input_unconditional = context_mask * next_input_unconditional_context + (1 - context_mask) * next_input_unconditional_zeros + + # For audio phase items, we use audio embedding for the unconditional branch + if needs_audio.any(): + audio_mask = needs_audio.view(batch_size, 1, 1).float() + next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask + + # Concatenate conditional and unconditional: (2*B, 1, E) + next_input = torch.cat([next_input, next_input_unconditional], dim=0) + + # 
==================== FORWARD PASS ==================== + cache_position = torch.tensor([state.cache_seq_len], device=device) + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=state.past_key_values, + cache_position=cache_position, + ) + + state.last_hidden = transformer_out.last_hidden_state + state.past_key_values = transformer_out.past_key_values + state.cache_seq_len += 1 + + # ==================== UPDATE STATE ==================== + # Update context_position for items in context phase + state.context_position = state.context_position + needs_context.long() + # Keep updating text_tokens_seen for items once the context is finished + # This is because this counter is used to determine when to start predicting phonemes and audio + state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() + + # Update phoneme_steps for items in phoneme or audio phase + state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() + + # Update audio_steps for items in audio phase + state.audio_steps = state.audio_steps + needs_audio.long() + + # ==================== PREDICTIONS ==================== + pred_phoneme_tokens = None + audio_codes_next = None + + # Phoneme predictions for items in phoneme or audio phase + if needs_phoneme.any() and self.phoneme_tokenizer is not None: + # Track phoneme prediction start index for items just entering phoneme phase + first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) + if first_phoneme_step.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append + state.phoneme_prediction_start_idx = torch.where( + first_phoneme_step, + torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), + state.phoneme_prediction_start_idx ) - # Compute masked phoneme channel input - phoneme_channel_input_t, use_phoneme_input = self._compute_phoneme_channel_input( - 
input_phoneme_embedding=input_phoneme_embedding, - current_phoneme_positions=current_phoneme_positions, - phoneme_stream_ended=phoneme_stream_ended, - actual_batch_size=actual_batch_size, - device=device, + # Check which items should predict phonemes (not ended) + pred_phoneme_tokens = self._predict_phoneme_tokens(state) # (B, phoneme_stacking_factor) + state.last_phoneme_tokens = pred_phoneme_tokens + state.all_phoneme_predictions.append(pred_phoneme_tokens) + + # Check for phoneme EOS per item + phoneme_eos_detected = needs_phoneme & (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1) # (B,) + state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + + # Track phoneme prediction end index for items that just ended + newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) + if newly_ended_phoneme.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # after append + state.phoneme_prediction_end_idx = torch.where( + newly_ended_phoneme, + torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), + state.phoneme_prediction_end_idx ) - # Collect phoneme tokens for logging (no printing here) - self._collect_phoneme_tokens_for_logging( - pred_phoneme_tokens=pred_phoneme_tokens, - gt_phoneme_tokens_current=gt_phoneme_tokens_current, - use_phoneme_input=use_phoneme_input, - pred_phoneme_token_lists=pred_phoneme_token_lists, - gt_phoneme_token_lists=gt_phoneme_token_lists, - batch_size=actual_batch_size, + # Audio predictions for items in audio phase + if needs_audio.any(): + # Track audio prediction start index for items just entering audio phase + first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1) + if first_audio_step.any(): + # Track start in terms of frames (not steps) + current_frame_idx = sum(p.size(-1) for p in state.all_predictions) # total frames so far + state.audio_prediction_start_idx = torch.where( + first_audio_step, + 
torch.full_like(state.audio_prediction_start_idx, current_frame_idx), + state.audio_prediction_start_idx ) - # Check for phoneme EOS - for item_idx in range(actual_batch_size): - if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): - if verbose and not phoneme_stream_ended[item_idx]: - logging.info(f"Phoneme EOS detected for item {item_idx} at timestep {idx}") - phoneme_stream_ended[item_idx] = True - - # Check for audio EOS - self._check_eos_and_update_end_indices( - all_codes_next_argmax=all_codes_next_argmax, - audio_codes_next=audio_codes_next, - end_indices=end_indices, - context_plus_audio_lens=context_plus_audio_lens, - min_context_len=min_context_len, - idx=idx, - verbose=verbose, - ) + audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) + + # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor + S = self.frame_stacking_factor + C = self.num_audio_codebooks + audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S) # (B, C, S) + + # Update last_audio_codes with stacked format (needed for next step's embedding) + if state.last_audio_codes is None: + state.last_audio_codes = audio_codes_next_stacked + else: + update_mask = needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked) + state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) + + # Check for EOS in each frame and track exact end position + # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) + all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) + + # For each batch item, find if/where EOS occurs in this step's frames + eos_in_sampled = (audio_codes_unstacked == self.audio_eos_id) # (B, C, S) + eos_in_argmax = (all_codes_argmax_unstacked == self.audio_eos_id) # (B, C, S) + eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) + + # Find first frame with EOS per batch 
item (or S if none) + eos_frame_idx = torch.where( + eos_any_codebook.any(dim=1), + eos_any_codebook.int().argmax(dim=1), # first frame with EOS + torch.full((batch_size,), S, device=device) # no EOS in this step + ) # (B,) + + audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + state.finished = state.finished | audio_eos_detected + + # Track audio prediction end index (in frames) for items that just ended + newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) + if newly_ended_audio.any(): + # End index = current frame count + frame offset where EOS was found + current_frame_count = len(state.all_predictions) * self.frame_stacking_factor + end_frame_idx = current_frame_count + eos_frame_idx + state.audio_prediction_end_idx = torch.where( + newly_ended_audio, + end_frame_idx, + state.audio_prediction_end_idx + ) - all_predictions.append(audio_codes_next) - - # Prepare input for next decoder step - next_input = self._prepare_next_decoder_input( - audio_codes_next=audio_codes_next, - context_plus_audio_embedded=context_plus_audio_embedded, - context_plus_audio_lens=context_plus_audio_lens, - min_context_len=min_context_len, - idx=idx, - current_text_input_mode=current_text_input_mode, - remaining_text_embedded=remaining_text_embedded, - current_text_positions=current_text_positions, - phoneme_channel_input_t=phoneme_channel_input_t, - use_cfg=use_cfg, - dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + # Store unstacked codes + state.all_predictions.append(audio_codes_unstacked) + audio_codes_next = audio_codes_unstacked + + return state, audio_codes_next, pred_phoneme_tokens + + def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: + """Predict phoneme tokens from the last hidden state.""" + actual_batch_size = state.batch_size + last_hidden = state.last_hidden + + # Get phoneme logits + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) + 
all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + # Sample phonemes + if state.phoneme_sampling_method == 'argmax': + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) + else: + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk + ) + # (B, phoneme_stacking_factor) + return pred_phoneme_tokens + + def _predict_audio_codes( + self, state: StreamingState + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predict audio codes from the last hidden state.""" + actual_batch_size = state.batch_size + last_hidden = state.last_hidden + + # Compute audio logits + last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) + all_code_logits_t = self.final_proj(last_hidden_audio) + + # Apply CFG if enabled + if state.use_cfg: + conditional_logits = all_code_logits_t[:actual_batch_size] + unconditional_logits = all_code_logits_t[actual_batch_size:] + all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits + + # Sample audio codes + audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( + last_hidden=last_hidden, + all_code_logits_t=all_code_logits_t, + temperature=state.temperature, + topk=state.topk, + use_local_transformer_for_inference=state.use_local_transformer, + use_cfg=state.use_cfg, + cfg_scale=state.cfg_scale, + ) + + return audio_codes_next, all_codes_next_argmax + + def streaming_decode( + self, + state: StreamingState, + previous_decode_length: int = 0, + ) -> Tuple[torch.Tensor, torch.Tensor, int]: + """ + Decode accumulated audio codes to waveform, returning only the new chunk. + + WARNING: This function does not yet support batch_size > 1. + Do not use with batched streaming inference. Use streaming_finalize instead. 
+ + This function takes all predicted codes so far and decodes them, but only + returns the newly generated audio portion (after previous_decode_length). + + Args: + state: Current StreamingState containing all_predictions. + previous_decode_length: Number of audio samples already decoded and returned + in previous calls. Use 0 on first call. + + Returns: + Tuple of: + - new_audio: Newly generated audio waveform (1, new_samples) + - new_audio_len: Length of new audio (1,) + - total_decode_length: Total decoded length so far (use as previous_decode_length + for next call) + """ + if len(state.all_predictions) == 0: + return ( + torch.zeros(1, 0, device=state.device), + torch.zeros(1, dtype=torch.long, device=state.device), + previous_decode_length, + ) + + with torch.inference_mode(): + # Concatenate all predictions - each is (1, C, S), concat gives (1, C, T_total_frames) + predicted_codes = torch.cat(state.all_predictions, dim=-1) # (1, C, T_total_frames) + predicted_codes_lens = torch.tensor([predicted_codes.size(-1)], device=state.device) + + # Decode to audio (codes are already unstacked, no EOS removal needed) + audio, audio_len, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + # Extract only new audio + total_decode_length = audio_len[0].item() + if total_decode_length <= previous_decode_length: + return ( + torch.zeros(1, 0, device=state.device), + torch.zeros(1, dtype=torch.long, device=state.device), + previous_decode_length, ) - # Forward pass for next token - cache_position = torch.tensor([current_cache_seq_len], device=device) - transformer_out = self.forward( - inputs_embeds=next_input, - attention_mask=None, - use_cache=True, - past_key_values=past_kv, - cache_position=cache_position, + new_audio = audio[:, previous_decode_length:total_decode_length] + new_audio_len = torch.tensor([total_decode_length - previous_decode_length], device=state.device) + + return new_audio, new_audio_len, total_decode_length + + def streaming_finalize( + 
self, + state: StreamingState, + ) -> StreamingFinalizeOutput: + """ + Finalize streaming and return the complete generated audio and phoneme predictions. + + This function should be called after all streaming_step() calls are complete + (i.e., when state.finished.all() is True or max steps reached). + + Args: + state: Final StreamingState after streaming is complete. + + Returns: + StreamingFinalizeOutput containing audio, codes, and phoneme predictions. + """ + batch_size = state.batch_size + + # Extract and decode phoneme predictions + phoneme_tokens_list: List[List[int]] = [] + phoneme_text_list: List[str] = [] + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + # Stack phoneme predictions: each is (B, phoneme_stacking_factor) + all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + for i in range(batch_size): + start = max(0, state.phoneme_prediction_start_idx[i].item()) + end = state.phoneme_prediction_end_idx[i].item() + if end < 0: + end = all_phonemes.size(-1) + # Flatten stacked phonemes back to sequence + tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist() + # Remove special tokens (BOS, EOS, PAD) + special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id} + if hasattr(self.phoneme_tokenizer, 'pad_token_id'): + special.add(self.phoneme_tokenizer.pad_token_id) + tokens = [t for t in tokens if t not in special] + phoneme_tokens_list.append(tokens) + phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens)) + else: + phoneme_tokens_list = [[] for _ in range(batch_size)] + phoneme_text_list = ["" for _ in range(batch_size)] + + if len(state.all_predictions) == 0: + return StreamingFinalizeOutput( + audio=torch.zeros(batch_size, 0, device=state.device), + audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device), + audio_codes_len=torch.zeros(batch_size, 
dtype=torch.long, device=state.device), + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + with torch.inference_mode(): + # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) + all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) + total_frames = all_codes.size(-1) + num_codebooks = all_codes.size(1) + + # Start and end indices are in frames (not steps) + # If start_idx is -1, item never started audio predictions - use 0 + # If end_idx is -1, item never ended - use total_frames + start_indices = torch.clamp(state.audio_prediction_start_idx, min=0) + end_indices = torch.where( + state.audio_prediction_end_idx >= 0, + state.audio_prediction_end_idx, + torch.full_like(state.audio_prediction_end_idx, total_frames) + ) + + # Calculate per-item lengths (in frames) + predicted_codes_lens = end_indices - start_indices + max_len = predicted_codes_lens.max().item() + + # Handle case where all items have zero-length predictions + if max_len == 0: + return StreamingFinalizeOutput( + audio=torch.zeros(batch_size, 0, device=state.device), + audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype), + audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, ) - last_hidden = transformer_out.last_hidden_state - past_kv = transformer_out.past_key_values - current_cache_seq_len += 1 - - # Check if all items have finished - if len(end_indices) == actual_batch_size: - if verbose: - logging.info(f"All items finished at timestep {idx}") - break - - # Log phoneme predictions if verbose - if verbose and self.phoneme_tokenizer is not None: - self._log_phoneme_predictions( - pred_phoneme_token_lists=pred_phoneme_token_lists, - gt_phoneme_token_lists=gt_phoneme_token_lists, - 
batch_size=actual_batch_size, + + # Create padded output tensor and slice each item's valid predictions + predicted_codes = torch.zeros( + batch_size, num_codebooks, max_len, + dtype=all_codes.dtype, device=state.device + ) + for i in range(batch_size): + start = start_indices[i].item() + end = end_indices[i].item() + length = end - start + if length > 0: + predicted_codes[i, :, :length] = all_codes[i, :, start:end] + + # No need to remove EOS - end_indices already point to the frame before EOS + # Decode to audio (codes are already unstacked: B, C, T) + audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + return StreamingFinalizeOutput( + audio=audio, + audio_len=audio_len, + audio_codes=predicted_codes, + audio_codes_len=predicted_codes_lens, + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + def infer_batch( + self, + batch: Dict[str, torch.Tensor], + max_decoder_steps: int = 500, + temperature: float = 0.7, + topk: int = 80, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer_for_inference: bool = False, + phoneme_input_type: str = 'pred', + phoneme_sampling_method: str = 'argmax', + force_dropout_text: bool = False, + ) -> InferBatchOutput: + """ + Batch inference using streaming infrastructure. + + This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize + that processes a batch dictionary similar to training_step/validation_step. + + Args: + batch: Dictionary containing: + - text: Text token IDs (B, L) + - text_lens: Lengths (B,) + - context_text_tokens: Context text tokens (B, L') + - context_text_tokens_lens: Lengths (B,) + - context_audio_codes: Context audio codes (B, C, T) OR + - context_audio / context_audio_lens: Raw context audio to encode + - phoneme_tokens (optional): GT phoneme tokens (B, L'') + - phoneme_tokens_lens (optional): Lengths (B,) + max_decoder_steps: Maximum number of decoder steps. 
+ temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer_for_inference: Whether to use local transformer. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + force_dropout_text: Whether to dropout text embeddings. + + Returns: + InferBatchOutput containing predicted audio, codes, and RTF metrics. + """ + with torch.inference_mode(): + start_time = time.time() + + # Extract tensors from batch + text = batch['text'] + text_lens = batch['text_lens'] + context_text_tokens = batch['context_text_tokens'] + context_text_tokens_lens = batch['context_text_tokens_lens'] + + # Handle context audio - either use codes directly or encode from audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes( + context_audio, context_audio_lens ) - # Post-process predictions - tts_generation_time = time.time() - start_time - tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - - # Calculate predicted lengths, accounting for context offset - pred_codes_start_indices = context_plus_audio_lens - min_context_len - predicted_lens = [end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size)] - predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices - - # Stack and slice predictions to remove context portion - predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) - predicted_codes = self.slice_pred_embeddings( - predicted_codes.permute(0, 2, 1), - 
context_lens=pred_codes_start_indices, - target_lens=predicted_codes_lens, + # Optional GT phoneme tokens for teacher forcing + gt_phoneme_tokens = batch.get('phoneme_tokens') + gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + + batch_size = text.size(0) + + # Initialize streaming state + state = self.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer_for_inference, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, ) - predicted_codes = predicted_codes.permute(0, 2, 1) - # Remove EOS tokens and convert codes to audio - predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) - predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + time_to_first_prediction = None + generation_start_time = time.time() + device = text.device + + # Generate until all items are finished or max steps reached + while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + # Gather the correct text token for each batch item based on text_tokens_seen + # Items in context phase will have their token ignored by streaming_step + positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) + current_tokens = text[torch.arange(batch_size, device=device), positions] + + # For items that have exhausted their text, provide EOS token + text_exhausted = state.text_tokens_seen >= text_lens + current_tokens = torch.where(text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens) + + state, audio_codes, phoneme_tokens = self.streaming_step( + 
state=state, + text_tokens=current_tokens, + force_dropout_text=force_dropout_text, + ) + + # Record time to first audio prediction + if time_to_first_prediction is None and audio_codes is not None: + time_to_first_prediction = time.time() - start_time + + tts_generation_time = time.time() - generation_start_time + + # Finalize and decode audio + finalize_output = self.streaming_finalize(state) - # Compute RTF metrics end_time = time.time() - total_audio_duration_generated = ( - predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] - ) / self.sample_rate - rtf = total_audio_duration_generated / (end_time - start_time) + total_time = end_time - start_time + + # Compute RTF metrics + total_audio_samples = finalize_output.audio_len.sum().item() + total_audio_duration = total_audio_samples / self.output_sample_rate + num_frames = len(state.all_predictions) + tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 rtf_metrics = { - 'rtf': rtf, + 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, 'time_to_first_prediction': time_to_first_prediction, 'tts_generation_time': tts_generation_time, - 'max_frames_generated': len(all_predictions), + 'max_frames_generated': num_frames, 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': actual_batch_size, + 'batch_size': batch_size, } - return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics + return InferBatchOutput( + predicted_audio=finalize_output.audio, + predicted_audio_lens=finalize_output.audio_len, + predicted_codes=finalize_output.audio_codes, + predicted_codes_lens=finalize_output.audio_codes_len, + rtf_metrics=rtf_metrics, + ) @classmethod def list_available_models(cls) -> List[PretrainedModelInfo]: diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 34ba8d62c730..9b0db0f7f75e 100644 --- 
a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -319,21 +319,24 @@ def _run_decoder_only_inference( for batch_idx, batch in enumerate(dataloader): logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - batch_cuda = self._batch_to_cuda(batch) - - predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics = self.model.infer_batch( - batch_cuda, + batch = self._batch_to_cuda(batch) + output = self.model.infer_batch( + batch, max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, temperature=self.config.model_inference_parameters.temperature, topk=self.config.model_inference_parameters.topk, - use_local_transformer_for_inference=self.config.use_local_transformer, - maskgit_n_steps=self.config.maskgit_n_steps, use_cfg=self.config.use_cfg, cfg_scale=self.config.model_inference_parameters.cfg_scale, + use_local_transformer_for_inference=self.config.use_local_transformer, phoneme_input_type=self.config.phoneme_input_type, phoneme_sampling_method=phoneme_sampling_method, - dropout_text_input=self.config.dropout_text_input, + force_dropout_text=self.config.dropout_text_input, ) + predicted_audio = output.predicted_audio + predicted_audio_lens = output.predicted_audio_lens + predicted_codes = output.predicted_codes + predicted_codes_lens = output.predicted_codes_lens + rtf_metrics = output.rtf_metrics all_rtf_metrics.append(rtf_metrics) logging.info(f"Output shape: {predicted_audio.size()}") @@ -342,7 +345,8 @@ def _run_decoder_only_inference( audio_len = predicted_audio_lens[idx].item() audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") - sf.write(audio_path, audio_np, self.model.sample_rate) + sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) + sf.write(audio_path, audio_np, sample_rate) 
generated_audio_paths.append(audio_path) if save_context_audio and item_idx < len(manifest_records): From cd59639f34ff643d8ef4ab756d732852c790eba1 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 5 Feb 2026 22:34:49 +0000 Subject: [PATCH 36/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_streaming_inference.py | 60 ++++---- nemo/collections/tts/models/easy_magpietts.py | 130 ++++++++++-------- 2 files changed, 105 insertions(+), 85 deletions(-) diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py index 6e72ea77b8e6..d25172d4e1f6 100644 --- a/examples/tts/magpietts_streaming_inference.py +++ b/examples/tts/magpietts_streaming_inference.py @@ -141,6 +141,7 @@ def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: # Resample if needed if sr != target_sample_rate: import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) @@ -241,9 +242,7 @@ def run_streaming_inference( context_audio_lens = context_audio_lens.to(device) with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = model.audio_to_codes( - context_audio, context_audio_lens - ) + context_audio_codes, context_audio_codes_lens = model.audio_to_codes(context_audio, context_audio_lens) # Tokenize context text # Use the text conditioning tokenizer @@ -357,8 +356,10 @@ def run_streaming_inference( num_phoneme_frames += 1 if verbose and (i + 1) % 10 == 0: - phase = "prompt" if audio_codes is None and phoneme_tokens is None else ( - "phoneme-only" if audio_codes is None else "audio" + phase = ( + "prompt" + if audio_codes is None and phoneme_tokens is None + else ("phoneme-only" if audio_codes is None else "audio") ) logging.info( f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " @@ -373,7 +374,9 @@ def run_streaming_inference( # Continue generating until 
finished (text has ended) continuation_steps = 0 while not state.finished and continuation_steps < max_steps: - state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=None, force_dropout_text=force_dropout_text + ) if audio_codes is not None: num_audio_frames += 1 @@ -494,7 +497,7 @@ def run_batched_streaming_inference( for i in range(batch_size): codes = context_audio_codes_list[i] codes_len = context_audio_codes_lens_list[i] - context_audio_codes[i, :, :codes.size(-1)] = codes[0] + context_audio_codes[i, :, : codes.size(-1)] = codes[0] context_audio_codes_lens[i] = codes_len[0] # Tokenize context texts @@ -510,7 +513,7 @@ def run_batched_streaming_inference( context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) for i, tokens in enumerate(context_text_tokens_list): - context_text_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + context_text_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) context_text_tokens_lens[i] = len(tokens) # Tokenize main texts @@ -543,7 +546,7 @@ def run_batched_streaming_inference( gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) for i, tokens in enumerate(phoneme_tokens_lists): - gt_phoneme_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + gt_phoneme_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) gt_phoneme_tokens_lens[i] = len(tokens) phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' @@ -625,9 +628,13 @@ def run_batched_streaming_inference( all_text_done = text_finished_mask.all() and not in_context_phase.any() if all_text_done: - state, audio_codes, phoneme_tokens = model.streaming_step(state, 
text_tokens=None, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=None, force_dropout_text=force_dropout_text + ) else: - state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text + ) if audio_codes is not None: num_audio_frames += 1 @@ -712,8 +719,7 @@ def main(): type=str, nargs='+', required=True, - help='Path(s) to context audio file(s) for speaker cloning. ' - 'Multiple files enable batched inference.', + help='Path(s) to context audio file(s) for speaker cloning. ' 'Multiple files enable batched inference.', ) input_group.add_argument( '--context_text', @@ -721,7 +727,7 @@ def main(): nargs='+', default=["[NO TEXT CONTEXT]"], help='Context text(s) for speaker conditioning. Provide one per context audio, ' - 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', + 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', ) input_group.add_argument( '--context_duration', @@ -729,8 +735,8 @@ def main(): nargs='+', default=[5.0], help='Target duration(s) for context audio in seconds. Provide one per context audio, ' - 'or a single value to use for all. If audio is longer, ' - 'first N seconds are used. If shorter, audio is repeated. (default: 5.0)', + 'or a single value to use for all. If audio is longer, ' + 'first N seconds are used. If shorter, audio is repeated. (default: 5.0)', ) input_group.add_argument( '--text', @@ -745,13 +751,13 @@ def main(): nargs='+', default=None, help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. 
' - 'Provide one per context audio for batched inference.', + 'Provide one per context audio for batched inference.', ) input_group.add_argument( '--use_gt_phonemes', action='store_true', help='Use ground-truth phonemes as decoder input (teacher forcing). ' - 'If not set, uses model-predicted phonemes.', + 'If not set, uses model-predicted phonemes.', ) # Output arguments @@ -851,13 +857,17 @@ def main(): if len(context_texts) == 1 and batch_size > 1: context_texts = context_texts * batch_size elif len(context_texts) != batch_size: - parser.error(f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1" + ) context_durations = args.context_duration if len(context_durations) == 1 and batch_size > 1: context_durations = context_durations * batch_size elif len(context_durations) != batch_size: - parser.error(f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1" + ) texts = args.text if len(texts) == 1 and batch_size > 1: @@ -872,7 +882,9 @@ def main(): elif len(phoneme_texts) == 1 and batch_size > 1: phoneme_texts = phoneme_texts * batch_size elif len(phoneme_texts) != batch_size: - parser.error(f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1" + ) # Load and process context audios context_audios = [] @@ -925,14 +937,14 @@ def main(): if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir) - audio_np = output.audio[0, :output.audio_len[0].item()].cpu().numpy() + audio_np = output.audio[0, : 
output.audio_len[0].item()].cpu().numpy() sf.write(args.output_path, audio_np, model.output_sample_rate) logging.info(f"Output saved to: {args.output_path}") # Save decoded context audio for sanity check output_base, output_ext = os.path.splitext(args.output_path) context_output_path = f"{output_base}_context_decoded{output_ext}" - context_audio_np = context_audio_decoded[0, :context_audio_decoded_lens[0].item()].cpu().numpy() + context_audio_np = context_audio_decoded[0, : context_audio_decoded_lens[0].item()].cpu().numpy() sf.write(context_output_path, context_audio_np, model.output_sample_rate) logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") @@ -989,7 +1001,7 @@ def main(): for i in range(batch_size): output_path_i = f"{output_base}_{i}{output_ext}" - audio_np = output.audio[i, :output.audio_len[i].item()].cpu().numpy() + audio_np = output.audio[i, : output.audio_len[i].item()].cpu().numpy() sf.write(output_path_i, audio_np, model.output_sample_rate) audio_duration_i = output.audio_len[i].item() / model.output_sample_rate logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 1351b8409417..4c9b26ded4d7 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1210,9 +1210,7 @@ def prepare_text_channel_embeddings( # Create zero tensor for delay padding max_delay = delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with text embeddings text_channel_embedding, text_channel_lens = self.join_embeddings_temporally( @@ -1277,9 +1275,7 @@ def prepare_phoneme_channel_embeddings( # Create zero tensor for delay padding max_delay = 
delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with phoneme embeddings phoneme_channel_embedding, phoneme_channel_lens = self.join_embeddings_temporally( @@ -1355,9 +1351,7 @@ def prepare_audio_channel_embeddings( # Create zero tensor for delay padding max_delay = delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with audio embeddings audio_channel_embedding, audio_channel_lens = self.join_embeddings_temporally( @@ -1667,13 +1661,18 @@ def process_batch( # 7. Join context with combined channel embeddings # The combined_channel_lens is the max of all channel lens for each batch item - combined_channel_lens = torch.stack([ - text_channel_lens, - audio_channel_lens, - phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, - ], dim=0).max(dim=0).values - - + combined_channel_lens = ( + torch.stack( + [ + text_channel_lens, + audio_channel_lens, + phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, + ], + dim=0, + ) + .max(dim=0) + .values + ) # Right pad context embedding context_padding = torch.zeros( @@ -1767,7 +1766,7 @@ def training_step(self, batch, batch_idx): context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - + if 'audio_codes' in batch: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] @@ -1856,7 +1855,7 @@ def validation_step(self, batch, batch_idx): context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] context_audio_codes, 
context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - + if 'audio_codes' in batch: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] @@ -1864,7 +1863,7 @@ def validation_step(self, batch, batch_idx): audio = batch['audio'] audio_lens = batch['audio_lens'] audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) - + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], @@ -2049,7 +2048,6 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def _sample_audio_codes( self, last_hidden: torch.Tensor, @@ -2161,13 +2159,15 @@ def streaming_init( selected_training_mode = self.mode_name_to_mode[mode_name] # Prepare context embedding using shared helper - context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = self.prepare_context_tensors( - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - training_mode=selected_training_mode, - dropout_conditional_input=False, + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=False, + ) ) # Store full context embedding and lens before any CFG manipulation @@ -2331,10 +2331,10 @@ def streaming_step( ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) # Gather: need (B, 1, E) from (B, T, E) at positions (B,) ctx_emb = state.full_context_embedding[ - torch.arange(batch_size, device=device), - ctx_positions, - : - ].unsqueeze(1) # (B, 1, E) + 
torch.arange(batch_size, device=device), ctx_positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) # Only apply to items in context phase context_mask = needs_context.view(batch_size, 1, 1).float() next_input = next_input + ctx_emb * context_mask @@ -2359,7 +2359,7 @@ def streaming_step( next_input = next_input + text_embedded * text_add_mask # Check for EOS tokens - mark those items as text_finished # Items that receive EOS should not have their text embedded added after this step - is_eos_token = (text_tokens == self.eos_id) # (B,) bool + is_eos_token = text_tokens == self.eos_id # (B,) bool state.text_finished = state.text_finished | is_eos_token elif text_tokens is None: @@ -2378,13 +2378,17 @@ def streaming_step( positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) gt_emb = state.gt_phoneme_embeddings[ torch.arange(batch_size, device=device), positions, : - ].unsqueeze(1) # (B, 1, E) + ].unsqueeze( + 1 + ) # (B, 1, E) phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + gt_emb * phoneme_mask else: # Prediction mode: use BOS or last predicted phoneme first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) - has_last_phoneme = needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + has_last_phoneme = ( + needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + ) if first_phoneme_step.any(): phoneme_bos = torch.full( @@ -2397,7 +2401,9 @@ def streaming_step( phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask if has_last_phoneme.any() and state.last_phoneme_tokens is not None: - last_phoneme_emb = self.embed_phoneme_tokens(state.last_phoneme_tokens.unsqueeze(2)) # (B, 1, E) + last_phoneme_emb = self.embed_phoneme_tokens( + state.last_phoneme_tokens.unsqueeze(2) + ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask @@ -2434,12 +2440,17 @@ def 
streaming_step( if state.use_cfg: # For unconditional branch, use dummy embedding for non-audio items # and audio-only embedding for audio items - next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand(batch_size, 1, -1) + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( + batch_size, 1, -1 + ) # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) context_mask = needs_context.view(batch_size, 1, 1).float() - next_input_unconditional = context_mask * next_input_unconditional_context + (1 - context_mask) * next_input_unconditional_zeros - + next_input_unconditional = ( + context_mask * next_input_unconditional_context + + (1 - context_mask) * next_input_unconditional_zeros + ) + # For audio phase items, we use audio embedding for the unconditional branch if needs_audio.any(): audio_mask = needs_audio.view(batch_size, 1, 1).float() @@ -2488,7 +2499,7 @@ def streaming_step( state.phoneme_prediction_start_idx = torch.where( first_phoneme_step, torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), - state.phoneme_prediction_start_idx + state.phoneme_prediction_start_idx, ) # Check which items should predict phonemes (not ended) @@ -2497,7 +2508,11 @@ def streaming_step( state.all_phoneme_predictions.append(pred_phoneme_tokens) # Check for phoneme EOS per item - phoneme_eos_detected = needs_phoneme & (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1) # (B,) + phoneme_eos_detected = needs_phoneme & ( + pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id + ).any( + dim=1 + ) # (B,) state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected # Track phoneme prediction end index for items that just ended @@ -2507,7 +2522,7 @@ def streaming_step( state.phoneme_prediction_end_idx = torch.where( 
newly_ended_phoneme, torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), - state.phoneme_prediction_end_idx + state.phoneme_prediction_end_idx, ) # Audio predictions for items in audio phase @@ -2520,7 +2535,7 @@ def streaming_step( state.audio_prediction_start_idx = torch.where( first_audio_step, torch.full_like(state.audio_prediction_start_idx, current_frame_idx), - state.audio_prediction_start_idx + state.audio_prediction_start_idx, ) audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) @@ -2542,15 +2557,15 @@ def streaming_step( all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = (audio_codes_unstacked == self.audio_eos_id) # (B, C, S) - eos_in_argmax = (all_codes_argmax_unstacked == self.audio_eos_id) # (B, C, S) + eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) + eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) # Find first frame with EOS per batch item (or S if none) eos_frame_idx = torch.where( eos_any_codebook.any(dim=1), eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device) # no EOS in this step + torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) @@ -2563,9 +2578,7 @@ def streaming_step( current_frame_count = len(state.all_predictions) * self.frame_stacking_factor end_frame_idx = current_frame_count + eos_frame_idx state.audio_prediction_end_idx = torch.where( - newly_ended_audio, - end_frame_idx, - state.audio_prediction_end_idx + newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx ) # Store unstacked codes @@ -2585,9 +2598,7 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # 
Sample phonemes if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) else: pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk @@ -2595,9 +2606,7 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # (B, phoneme_stacking_factor) return pred_phoneme_tokens - def _predict_audio_codes( - self, state: StreamingState - ) -> Tuple[torch.Tensor, torch.Tensor]: + def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]: """Predict audio codes from the last hidden state.""" actual_batch_size = state.batch_size last_hidden = state.last_hidden @@ -2745,7 +2754,7 @@ def streaming_finalize( end_indices = torch.where( state.audio_prediction_end_idx >= 0, state.audio_prediction_end_idx, - torch.full_like(state.audio_prediction_end_idx, total_frames) + torch.full_like(state.audio_prediction_end_idx, total_frames), ) # Calculate per-item lengths (in frames) @@ -2765,8 +2774,7 @@ def streaming_finalize( # Create padded output tensor and slice each item's valid predictions predicted_codes = torch.zeros( - batch_size, num_codebooks, max_len, - dtype=all_codes.dtype, device=state.device + batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device ) for i in range(batch_size): start = start_indices[i].item() @@ -2846,9 +2854,7 @@ def infer_batch( else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - context_audio, context_audio_lens - ) + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) # Optional GT phoneme tokens for teacher forcing gt_phoneme_tokens = 
batch.get('phoneme_tokens') @@ -2886,7 +2892,9 @@ def infer_batch( # For items that have exhausted their text, provide EOS token text_exhausted = state.text_tokens_seen >= text_lens - current_tokens = torch.where(text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens) + current_tokens = torch.where( + text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens + ) state, audio_codes, phoneme_tokens = self.streaming_step( state=state, From e96f34445697ea08641814f64fbbf451f829f599 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 5 Feb 2026 18:02:48 -0500 Subject: [PATCH 37/94] include vocab file Signed-off-by: Paarth Neekhara --- ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 +++++++++++++++++ 1 file changed, 9954 insertions(+) create mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json new file mode 100644 index 000000000000..6d7e35116405 --- /dev/null +++ b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json @@ -0,0 +1,9954 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + 
"use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "(": 3, + ")": 4, + "-": 5, + ".": 6, + "1": 7, + "2": 8, + "4": 9, + "5": 10, + "6": 11, + "7": 12, + "F": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "h": 20, + "i": 21, + "j": 22, + "k": 23, + "l": 24, + "m": 25, + "n": 26, + "o": 27, + "p": 28, + "q": 29, + "r": 30, + "s": 31, + "t": 32, + "u": 33, + "v": 34, + "w": 35, + "x": 36, + "y": 37, + "z": 38, + "¡": 39, + "£": 40, + "¦": 41, + "§": 42, + "©": 43, + "ª": 44, + "¬": 45, + "°": 46, + "²": 47, + "³": 48, + "¸": 49, + "¹": 50, + "¾": 51, + "Ã": 52, + "Å": 53, + "É": 54, + "Ê": 55, + "Ë": 56, + "Ì": 57, + "Î": 58, + "Ï": 59, + "Ċ": 60, + "Ġ": 61, + "Ģ": 62, + "ģ": 63, + "Ĥ": 64, + "ĥ": 65, + "ĩ": 66, + "Ī": 67, + "Ĭ": 68, + "ĭ": 69, + "Į": 70, + "į": 71, + "İ": 72, + "ı": 73, + "IJ": 74, + "ij": 75, + "Ĵ": 76, + "ĵ": 77, + "Ķ": 78, + "ķ": 79, + "ĸ": 80, + "Ĺ": 81, + "Ļ": 82, + "Ľ": 83, + "ľ": 84, + "Ŀ": 85, + "Ł": 86, + "ËĪ": 87, + "ËIJ": 88, + "ËĪÉ": 89, + "ËĮ": 90, + "ÉĻ": 91, + "ËĪa": 92, + "ËĪi": 93, + "Ġt": 94, + "ɪ": 95, + "ɾ": 96, + "ĠÉ": 97, + "Ġk": 98, + "Éľ": 99, + "Ġs": 100, + "ËĪe": 101, + "ÉĽ": 102, + "ËĪo": 103, + "Ġl": 104, + "ËĪÉĽ": 105, + "Ġd": 106, + "ÊĬ": 107, + "ËĪaËIJ": 108, + "Ġp": 109, + "Ìĥ": 110, + "Ġm": 111, + "ËĪu": 112, + "Åĭ": 113, + "ð": 114, + "ËĪÉĶ": 115, + "ÊĮ": 116, + "ËĮa": 117, + "Ġh": 118, + "ËĪÊĮ": 119, + "Ġn": 120, + "Êģ": 121, + "ËĪÉij": 122, + "Êĥ": 123, + "eËIJ": 124, + "Ġa": 125, + "Ġb": 126, + "ÉĶ": 127, + "ËĪÉĻ": 128, + "ÉĻn": 129, + "Ġf": 130, + "ËĪɪ": 131, + "É¡": 132, + "ËĪeËIJ": 133, + "Ġj": 134, + "nt": 135, + "Ġð": 136, + "ĠËĮ": 137, + "Ġts": 138, + "ĠÉ¡": 139, + "Éķ": 140, + "ËĪoËIJ": 141, + "ʰ": 142, + "aËIJ": 143, + "ËĪy": 144, + "ĠtÉķ": 145, + "ËĪiËIJ": 146, 
+ "ĠÊ": 147, + "Ġv": 148, + "Ġw": 149, + "st": 150, + "Éij": 151, + "nd": 152, + "ËĮi": 153, + "̪": 154, + "ËĮe": 155, + "Ġz": 156, + "ËĪaɪ": 157, + "ËĪiÉĽ": 158, + "β": 159, + "ɹ": 160, + "ĠËĮa": 161, + "θ": 162, + "ĠhÉĽ": 163, + "ÊĪ": 164, + "iËIJ": 165, + "ËĮo": 166, + "Ġɪ": 167, + "Éľn": 168, + "Ġx": 169, + "ĠtÉĻ": 170, + "ËĪuËIJ": 171, + "ËĮÉĻ": 172, + "ĠjËĪi": 173, + "ËĮÉĽ": 174, + "ĠÉĽ": 175, + "ĠËĪa": 176, + "ËĮaËIJ": 177, + "Ġla": 178, + "Ġðe": 179, + "ĠhÉĽËIJ": 180, + "Ġe": 181, + "ç": 182, + "ÉĻl": 183, + "oËIJ": 184, + "ËĪÉiju": 185, + "ÊĴ": 186, + "uËIJ": 187, + "ĠÉĹ": 188, + "ĠÉķ": 189, + "ËĮeËIJ": 190, + "ĠtÉķËĪi": 191, + "os": 192, + "ËĪÉĶËIJ": 193, + "as": 194, + "ËĪÊĬ": 195, + "Ġi": 196, + "ËĪai": 197, + "ɲ": 198, + "ɪn": 199, + "ts": 200, + "ÉľÅĭ": 201, + "ĠÉŁ": 202, + "ĠÊĥ": 203, + "ËĪeɪ": 204, + "ÉĽÉ¾": 205, + "ËĪÉĽËIJ": 206, + "ËĪÉĽÉ¾": 207, + "Ġr": 208, + "tÊĥ": 209, + "ËĮÉĶ": 210, + "ĠdÉĻ": 211, + "tÉĻ": 212, + "ou": 213, + "ËĪyÉĻ": 214, + "ĠËĮi": 215, + "ÉĻɾ": 216, + "ËĪÉĻÊĬ": 217, + "ËĪÊĮɾ": 218, + "ËĪÉĴ": 219, + "Ġth": 220, + "ËĪon": 221, + "Êĭ": 222, + "ËĪÉijËIJ": 223, + "ËĪÊĮh": 224, + "wËĪa": 225, + "ËĪei": 226, + "ll": 227, + "ĠÉIJ": 228, + "ÉijËIJ": 229, + "an": 230, + "ÉŁ": 231, + "ĠÊĭ": 232, + "Ġko": 233, + "kh": 234, + "ɪÅĭ": 235, + "ËĪaËIJɪ": 236, + "ĠtÊĥ": 237, + "ËĪaËIJt": 238, + "ĠËĮe": 239, + "ĠtÉķh": 240, + "ËĪuo": 241, + "ËĪonÉ¡": 242, + "Éĸ": 243, + "at": 244, + "Ġke": 245, + "ÉĴ": 246, + "ĠÉķËĪi": 247, + "ø": 248, + "ĠÉij": 249, + "ËĪeËIJk": 250, + "Åĵ": 251, + "re": 252, + "Ġɾ": 253, + "ĠkÉĶ": 254, + "ËĮÊĬ": 255, + "sk": 256, + "ĠÊĬ": 257, + "Ġand": 258, + "ɪç": 259, + "Ġme": 260, + "ËĪaɾ": 261, + "ĠËĪɪ": 262, + "na": 263, + "Ġβ": 264, + "ĠlËĪi": 265, + "jaËIJ": 266, + "li": 267, + "no": 268, + "Ġɪn": 269, + "ĠdËĮi": 270, + "Ġɲ": 271, + "tËIJ": 272, + "ÉĻm": 273, + "ĠlÉĻ": 274, + "ĠðÉĻ": 275, + "ɪk": 276, + "ËĪÉĽl": 277, + "Éľt": 278, + "Ġse": 279, + "es": 280, + "ËĪou": 281, + "ËĪaÊĬ": 282, + "ĠÉĶ": 283, + "ɪt": 284, + 
"ĠÅĭ": 285, + "ËĪÉĽn": 286, + "Êİ": 287, + "Ġkh": 288, + "ËĪÉĽnt": 289, + "ËĪaËIJɾ": 290, + "Ġki": 291, + "mp": 292, + "lt": 293, + "É£": 294, + "Ġpa": 295, + "ËĪÉĻËIJ": 296, + "ɪs": 297, + "ĠÉĴ": 298, + "Ġle": 299, + "ÉªÉľ": 300, + "ËĪÉĽt": 301, + "Ġde": 302, + "Ġɹ": 303, + "ĠtËĪoËIJ": 304, + "ĠÊģ": 305, + "ÊĥÉĻn": 306, + "ĠÊĬnt": 307, + "ËĪÉĶɾ": 308, + "ËĪað": 309, + "Ġaɪ": 310, + "ĠÊIJ": 311, + "ĠmËĪa": 312, + "ra": 313, + "ĠkËĪɪ": 314, + "kt": 315, + "ËIJp": 316, + "ĠÊĪ": 317, + "ËĪaËIJÊĬ": 318, + "ĠkËĪÊĮɾ": 319, + "ĠËĪÊĮ": 320, + "ĠÉĴv": 321, + "Ġel": 322, + "ks": 323, + "Ġkw": 324, + "ÉĻt": 325, + "ndo": 326, + "ei": 327, + "ĠËĮaËIJp": 328, + "se": 329, + "ÉĻɹ": 330, + "ËĪuei": 331, + "ÉĻs": 332, + "ĠkËĮo": 333, + "ĠÊĤ": 334, + "ĠËĮÊĬ": 335, + "Ġc": 336, + "ĠÉĽn": 337, + "ËĪant": 338, + "θj": 339, + "ËĮoËIJ": 340, + "ĠËĪaËIJ": 341, + "Ġpɾ": 342, + "si": 343, + "ĠËĪe": 344, + "ĠjuËIJ": 345, + "ĠkËĮe": 346, + "ËĮɪ": 347, + "ÉĶn": 348, + "ĠsËĪÊĮ": 349, + "ĠËĪu": 350, + "ni": 351, + "Ġst": 352, + "ĠdiËIJ": 353, + "ĠkeËIJ": 354, + "ĠjËĪiou": 355, + "ËĪaiÉľ": 356, + "ĠdÊĴ": 357, + "ĠËĪÉĶ": 358, + "va": 359, + "ËIJɾ": 360, + "ËĪø": 361, + "ËĮÉĻÊĬ": 362, + "ĠpËĪu": 363, + "Ġsu": 364, + "Ġma": 365, + "ĠÉĻ": 366, + "dÊĴ": 367, + "Ġpʰ": 368, + "le": 369, + "in": 370, + "ĠtÉķhËĪi": 371, + "ĠwËĪo": 372, + "ro": 373, + "ËĮy": 374, + "ɾa": 375, + "ĠsËĪi": 376, + "ðÉĻ": 377, + "ĠseËIJ": 378, + "la": 379, + "ĠÊĴ": 380, + "mb": 381, + "ĠhËĪoËIJ": 382, + "Ġbʰ": 383, + "ĠÉĽÉ¾": 384, + "Ġðat": 385, + "sp": 386, + "ÉĶɾ": 387, + "en": 388, + "ĠsÉĻ": 389, + "ËĪÉĶÉľ": 390, + "ĠlËĮa": 391, + "ĠËĮÉĽ": 392, + "ĠËĪy": 393, + "É¡aËIJ": 394, + "ĠdÉĽÉ¾": 395, + "ËĪÉĽÊģ": 396, + "Éľkh": 397, + "ËĪiÉĻ": 398, + "ËĪan": 399, + "ĠmËĪo": 400, + "ËĪaβ": 401, + "Ġal": 402, + "ĠËĪeËIJ": 403, + "Ġθ": 404, + "ĠnËĪi": 405, + "pʰ": 406, + "lla": 407, + "Ġpl": 408, + "ËĪÅĵ": 409, + "jËĪÉiju": 410, + "Ġav": 411, + "ĠmËĪi": 412, + "ĠfËĪa": 413, + "ËĪÉľ": 414, + "me": 415, + "ËĮÉĻh": 416, + "ËĪuÉĻ": 417, + 
"it": 418, + "jËĪe": 419, + "Ġo": 420, + "ËĪÉľËIJ": 421, + "ĠtÉķËĪiou": 422, + "ÉĶËIJ": 423, + "ĠnÉĻ": 424, + "ËĪÉĻÉľn": 425, + "ĠmÉĻ": 426, + "ĠdeËIJ": 427, + "mo": 428, + "sa": 429, + "jËĪÉĶ": 430, + "ËĪal": 431, + "ĠtÉķËĪiÉĽ": 432, + "ĠÉ¡ÉĻ": 433, + "ða": 434, + "Ġɪz": 435, + "Ġsa": 436, + "ri": 437, + "ĠËĮil": 438, + "ËĮu": 439, + "ĠkaËIJ": 440, + "ĠÉĻËIJ": 441, + "ĠÉĸ": 442, + "Ġka": 443, + "ËĪÊĮhi": 444, + "ĠjeËIJ": 445, + "Ġtʰ": 446, + "ne": 447, + "kËIJ": 448, + "ĠtsËĪai": 449, + "ĠËĪeËIJk": 450, + "nk": 451, + "ti": 452, + "ËĪaÉľn": 453, + "ĠkËIJ": 454, + "É¡ÉĻn": 455, + "ËĪia": 456, + "ĠÉĶËIJɾ": 457, + "Êı": 458, + "ĠËĮÊĮ": 459, + "ĠzËĪaËIJ": 460, + "Ġlos": 461, + "ÉĽs": 462, + "ËĪÉĶn": 463, + "ÉĽnt": 464, + "ÉĽn": 465, + "ĠÉŁËĪoËIJ": 466, + "çt": 467, + "Ġdas": 468, + "ĠxËĮo": 469, + "ËĪuÉľ": 470, + "ËĪas": 471, + "ĠbËĪÊĮ": 472, + "ËĪiÉĽÉľn": 473, + "ÉIJ": 474, + "ĠtsuËIJ": 475, + "ĠpËĮÉĽ": 476, + "ĠnËĪÉĶ": 477, + "ÊĬt": 478, + "ma": 479, + "ĠnËĪo": 480, + "ĠlËĪɪ": 481, + "ËĪÉĽs": 482, + "ɪl": 483, + "ĠÉķËĪiÉĽ": 484, + "ĠËĪÊĬ": 485, + "ÉĴt": 486, + "to": 487, + "ĠËĪo": 488, + "ËĮon": 489, + "ĠkwËĪa": 490, + "Ġɪt": 491, + "ĠhoËIJ": 492, + "ËĪiËIJk": 493, + "ĠËĮaËIJpk": 494, + "ËĪaɪn": 495, + "æ": 496, + "ÉĻnt": 497, + "ta": 498, + "lo": 499, + "ĠnËĪÉij": 500, + "ĠlËĪa": 501, + "ËĪiÉľ": 502, + "ĠwËĪei": 503, + "ÉĽÊģ": 504, + "ĠtËĪa": 505, + "ĠɾËĮÉĻh": 506, + "ĠÉķËĪiÉij": 507, + "ËĮiËIJ": 508, + "ËĮÉĽl": 509, + "ĠtÉĻÉľ": 510, + "ĠkËĪuo": 511, + "ĠtËĪu": 512, + "jËĪÉĽ": 513, + "ĠËĮin": 514, + "ɾe": 515, + "ĠkoËIJ": 516, + "ĠkËĪa": 517, + "ɾi": 518, + "ĠtÉķËĪiÉij": 519, + "lÉĻ": 520, + "ĠkÉĻ": 521, + "ĠtËĪi": 522, + "ĠÅĭËĪyÉĻ": 523, + "Ġtsh": 524, + "er": 525, + "av": 526, + "ĠkÉĶn": 527, + "ËĪÉĻÉľÅĭ": 528, + "ðo": 529, + "ËĪaËIJn": 530, + "ĠbʰËĪi": 531, + "ĠkËIJjaËIJ": 532, + "ÉĻz": 533, + "ĠpÊģ": 534, + "ĠdËĪɪ": 535, + "ĠziËIJ": 536, + "É¡eËIJ": 537, + "ĠtËĪÉĻ": 538, + "ɪz": 539, + "ĠnËĮon": 540, + "taËIJ": 541, + "bl": 542, + "te": 543, + "nËĮeËIJ": 544, + 
"ËĪɪl": 545, + "so": 546, + "ko": 547, + "uÊģ": 548, + "ĠÉ£": 549, + "ĠpaÊģ": 550, + "ĠËĪÉĽ": 551, + "jËĪuËIJ": 552, + "ËĮÊĮ": 553, + "yn": 554, + "ËĪiËIJn": 555, + "ĠlËĪaɪ": 556, + "ËĪɪÅĭ": 557, + "ĠtÉķhËĪy": 558, + "ĠnËĪÊĮhi": 559, + "ĠdËĮe": 560, + "ĠjËĪÉiju": 561, + "ĠtËĪÉiju": 562, + "ĠhËĪo": 563, + "ɪd": 564, + "ĠthËĪÉij": 565, + "mËĪe": 566, + "ĠËĪÉĻ": 567, + "ja": 568, + "Ġph": 569, + "ÉĽt": 570, + "ĠkËĪÊĮ": 571, + "tÉĻn": 572, + "mËĪÉij": 573, + "wËĪe": 574, + "ĠËĮaɪn": 575, + "Ġðɪs": 576, + "É¡ÉĻ": 577, + "ĠnËĪaËIJ": 578, + "ĠbËĪaËIJ": 579, + "Ġaθ": 580, + "ĠmËĮa": 581, + "ËĪÊĮha": 582, + "ĠdËĮa": 583, + "ËĪÊı": 584, + "ĠɲËĮy": 585, + "ĠpËĪa": 586, + "ËĪaðo": 587, + "di": 588, + "bÉľ": 589, + "ɳ": 590, + "ĠwiËIJ": 591, + "ĠnËĪɪ": 592, + "ĠÉ¡ËĪÉĶÉľ": 593, + "tËIJo": 594, + "ËĮÉĻm": 595, + "ËĪaËIJr": 596, + "ĠmÉĽ": 597, + "ËĪeËIJÉ¡aËIJ": 598, + "ĠsËĮi": 599, + "ĠlËĮaËIJ": 600, + "nËĮaËIJ": 601, + "Ġsp": 602, + "tÊģ": 603, + "ĠÊİ": 604, + "ËĮÉijËIJ": 605, + "Ġkl": 606, + "kʰ": 607, + "il": 608, + "ĠÊĥt": 609, + "ĠËĮÊĬn": 610, + "al": 611, + "ĠsËĪÉĽ": 612, + "ĠmËĪaËIJ": 613, + "ĠÅĵ": 614, + "ĠÉ¡ËĪÊĮ": 615, + "ĠpËĮÉĽr": 616, + "ɾËĪa": 617, + "ËIJÊĪ": 618, + "ËĪaβa": 619, + "ĠwËĪÉĴ": 620, + "ĠxËĪuei": 621, + "ĠkhËĪo": 622, + "Ġlas": 623, + "ĠÉĹËĪo": 624, + "ĠfÉĽÉ¾": 625, + "ĠjËĪiÉĽ": 626, + "ĠtËĪe": 627, + "ĠkËĮÉĶ": 628, + "ĠdeËIJn": 629, + "Ġmo": 630, + "ĠpËĪi": 631, + "ĠtËĪÉij": 632, + "ËĪÉĽst": 633, + "wËĪÉij": 634, + "ËĪaɪt": 635, + "ÉĻÊĬ": 636, + "ĠËĪi": 637, + "ɪj": 638, + "aɪ": 639, + "ËĪaËIJÉľ": 640, + "ĠËĪɪs": 641, + "ĠpÉĶɾ": 642, + "Ã¦Éľn": 643, + "ka": 644, + "ÅĭÉ¡": 645, + "bÉĻn": 646, + "ÊĬf": 647, + "Ġpɹ": 648, + "ĠlËĮe": 649, + "ËĪiËIJd": 650, + "ËĪaËIJre": 651, + "ĠmËĪÊĮ": 652, + "ÉĻr": 653, + "ĠdÉij": 654, + "ËĪaËIJto": 655, + "ĠpËĪeËIJ": 656, + "ĠdËĪoËIJ": 657, + "ĠsËĮÊĬ": 658, + "ĠhËĪi": 659, + "ĠsËĪa": 660, + "ËĪeËIJn": 661, + "dÉĻ": 662, + "Ġpj": 663, + "ËĪÅĵÊģ": 664, + "lɪç": 665, + "ÉĴn": 666, + "ĠËĪÉĻr": 667, + "tËĪe": 668, + "Ġil": 669, 
+ "ËĪaËIJl": 670, + "ĠsËĮÉĻÊĬ": 671, + "sÊĪ": 672, + "ĠdËĪuËIJ": 673, + "hËĪÉij": 674, + "ĠxËĪou": 675, + "ĠlËĪaiÉľ": 676, + "wËĪo": 677, + "ËĪÉĽnte": 678, + "Ġsy": 679, + "Ġzɪç": 680, + "ĠÉ¡ËĪu": 681, + "ĠÉķËĪy": 682, + "ËĪÉĶËIJl": 683, + "ÉĶl": 684, + "ĠtËĪo": 685, + "ĠÊĭoËIJ": 686, + "ĠiËIJ": 687, + "wËĪaða": 688, + "ËĪando": 689, + "Ġaθɼnt": 690, + "ĠaθɼntwËĪaða": 691, + "ĠtËĪiÉĽ": 692, + "ËĪeiÉľ": 693, + "ĠpËĮa": 694, + "ĠnËĪaɪ": 695, + "wa": 696, + "Ġfr": 697, + "ĠÊIJËĪÉĻÉľn": 698, + "ËĪua": 699, + "mi": 700, + "ĠmËĪÉĽ": 701, + "ËĪeËIJkʰ": 702, + "cʰ": 703, + "ĠwËĪÉij": 704, + "sta": 705, + "Ġtu": 706, + "Ġsk": 707, + "ËĪÉĶl": 708, + "ËĪeËIJÊĪ": 709, + "ĠlËĪaËIJɪ": 710, + "ĠlËĪaËIJ": 711, + "ËĪÉĽËIJs": 712, + "ËĪÉĽÉ¾a": 713, + "ËĪÉĻÉľt": 714, + "Ġyn": 715, + "dÉĻn": 716, + "Ġdi": 717, + "ËĪiËIJs": 718, + "Ġðel": 719, + "ËĪÊĮr": 720, + "ĠhËĪaËIJ": 721, + "ĠbÉĻ": 722, + "ĠjËĪuËIJ": 723, + "lle": 724, + "sto": 725, + "ËĪɪt": 726, + "ËĪoËIJɾ": 727, + "bʰ": 728, + "mÉĻn": 729, + "ËĮuÉĻ": 730, + "ËĮÉĻɾ": 731, + "ËĪÊĮn": 732, + "ĠlËĪaɪk": 733, + "ĠbËĪa": 734, + "ɪð": 735, + "Ġlo": 736, + "zi": 737, + "ËĪÊĮst": 738, + "mËĪi": 739, + "ÉĶÊģ": 740, + "ĠnËĪɪçt": 741, + "Ġtɾ": 742, + "ĠdËĪeËIJkʰ": 743, + "ĠsËĮe": 744, + "ĠnËĪÉĻÊĬ": 745, + "Ġu": 746, + "Ġsi": 747, + "Ġɪç": 748, + "Ġpr": 749, + "ĠtÉķËĪy": 750, + "ĠmËĪu": 751, + "za": 752, + "ĠtÊģ": 753, + "Ġwɪð": 754, + "tËĪÉĽ": 755, + "ĠpËĪÊĮɾ": 756, + "ĠkËĪÉĶ": 757, + "ËĪoËIJr": 758, + "ĠhËĮa": 759, + "ĠkËĪonÉ¡": 760, + "ĠpuÊģ": 761, + "Ġdy": 762, + "ËĪɪn": 763, + "nte": 764, + "ĠkËĮa": 765, + "ËĪÉĻɪ": 766, + "Ġmi": 767, + "ĠÉ¡ËĮuÉĻ": 768, + "Ġʲ": 769, + "ĠfËĪÉij": 770, + "ĠvÉijËIJ": 771, + "ĠËĮaÊĬ": 772, + "ËĮuËIJ": 773, + "ĠËĪun": 774, + "ĠjËĪÊĮha": 775, + "juËIJ": 776, + "Ġmɪt": 777, + "ĠlËĪÉĽ": 778, + "ËĪeËIJÊĥ": 779, + "ĠfÉĶËIJ": 780, + "mÉĻ": 781, + "ɾt": 782, + "ĠkËĮon": 783, + "ĠlËĪÉĶ": 784, + "ĠxËĪÉiju": 785, + "pl": 786, + "ĠdËĪi": 787, + "ĠlËĪoËIJ": 788, + "sÉĻ": 789, + "ËĪaËIJva": 790, + "ĠlËĪu": 791, + 
"ĠÉ¡ËĮÉĻÊĬ": 792, + "Ġhav": 793, + "ĠËĮaËIJpkËĮoËIJ": 794, + "ɾËĪi": 795, + "ĠfËĪÉĻ": 796, + "ĠhËĮÉĻm": 797, + "ËĪonÉ¡Éľ": 798, + "jo": 799, + "ĠsÉĶ": 800, + "ËĪaËIJd": 801, + "wËĪiÉĻ": 802, + "ËĪand": 803, + "ËĮaɪn": 804, + "tɾ": 805, + "ĠËĮɪ": 806, + "ĠËĪuna": 807, + "ĠxwËĪÉij": 808, + "ĠjÉĶËIJ": 809, + "ÊģËĪi": 810, + "ĠkËĪuoÉľ": 811, + "Ġaβ": 812, + "ĠÉ¡ËĪaËIJ": 813, + "ano": 814, + "tÉĻl": 815, + "ĠrËĮe": 816, + "ËĮÊĮt": 817, + "ĠjËĪiÉij": 818, + "ĠɾËĮÉĻhaËIJ": 819, + "ĠmËĪe": 820, + "ĠËĪyÃ¦Éľn": 821, + "ĠfËĪu": 822, + "Ġbl": 823, + "nËĪi": 824, + "sÉĻn": 825, + "Ġaɪn": 826, + "ËĪiÊĬ": 827, + "Ġðeɪ": 828, + "Ġɪts": 829, + "Ġ(": 830, + "ËĪyËIJ": 831, + "ÉĻd": 832, + "ĠËĮo": 833, + "ĠÉĽs": 834, + "ĠviËIJ": 835, + "ËIJÉ¡eËIJ": 836, + "kËĪe": 837, + "ĠËĪal": 838, + "ÉĽl": 839, + "ĠÊĮ": 840, + "ËIJo": 841, + "ĠkËĪo": 842, + "ĠÊĪËĪuËIJ": 843, + "ĠsËĪɪ": 844, + "ËĪeËIJɾ": 845, + "Éľm": 846, + "ËĮÉĻn": 847, + "ËĪaËIJi": 848, + "ËĪoËIJl": 849, + "ɪËĮeËIJ": 850, + "ĠʲËĪy": 851, + "ĠkËĪÉĶËIJ": 852, + "sËĪi": 853, + "ĠlËĪe": 854, + "ËĮÉĴt": 855, + "ËĪiËIJp": 856, + "aÊģ": 857, + "ĠθËĪɪÅĭ": 858, + "ËĪÉĻËIJɪ": 859, + "ËĪÊĮl": 860, + "ĠhËĪoËIJtaËIJ": 861, + "ËĪoɪ": 862, + "nto": 863, + "zh": 864, + "ĠdeËIJm": 865, + "ĠkÉĶm": 866, + "ʰËĪiËIJk": 867, + "ĠdÊĴËĪÊĮst": 868, + "pɾ": 869, + "Ġly": 870, + "hËĪu": 871, + "ËĪÉĶø": 872, + "ËĪaËIJs": 873, + "ĠËĪan": 874, + "ĠËĪÉĴ": 875, + "Ġkan": 876, + "ĠtsËĪuo": 877, + "ËĪeËIJva": 878, + "Ġɡɾ": 879, + "Ġpo": 880, + "ĠtÊĥËĪÉĶ": 881, + "Êİa": 882, + "ĠmËĮi": 883, + "Êĥt": 884, + "tËĪi": 885, + "ĠhËĪÊĮ": 886, + "tÊĥe": 887, + "ĠfÉĶn": 888, + "ve": 889, + "ĠnËĮe": 890, + "ËĪÉĶÊģ": 891, + "iz": 892, + "ĠsËĪuo": 893, + "ËĪÉĽËIJr": 894, + "wËĪaÊģ": 895, + "ËĪaða": 896, + "Åĭk": 897, + "po": 898, + "ĠkËĪi": 899, + "ËĪad": 900, + "ĠvËĪi": 901, + "tÉķ": 902, + "ĠkËĪÉĻ": 903, + "ĠwËĪu": 904, + "ÉĴz": 905, + "ĠvÉijËIJɾ": 906, + "ÊģËĪÉĽ": 907, + "ĠkËĪaËIJ": 908, + "ke": 909, + "nÉĻ": 910, + "ËĪÊĮb": 911, + "ËĪuËIJɾ": 912, + "ËĮÉĻËIJ": 913, + 
"ĠÊĪʰËĪiËIJk": 914, + "ĠkËĪu": 915, + "ĠbËĮÊĮt": 916, + "Ġat": 917, + "Ġfɹ": 918, + "ËĪax": 919, + "ĠzoËIJ": 920, + "ĠtËĪaËIJ": 921, + "ĠðËĮe": 922, + "neËIJ": 923, + "ĠÉijËIJ": 924, + "ĠaÊĬf": 925, + "am": 926, + "ÊĬÅĭ": 927, + "ĠÉĶËIJ": 928, + "ĠÉķËĪiÉľÅĭ": 929, + "ĠËĪÉĶËIJl": 930, + "ɪm": 931, + "jËĪo": 932, + "ËĪiËIJÉŁ": 933, + "ĠkwËĮÉĽ": 934, + "ĠmËĪas": 935, + "ÉĻh": 936, + "ĠËĪaÊĬ": 937, + "ËĪÉĶɪ": 938, + "É¡ÉĻɾ": 939, + "rÉĻn": 940, + "ËĪɪk": 941, + "sse": 942, + "ĠpËĪÉij": 943, + "ĠÉĹËĮe": 944, + "ĠÉĹËĪi": 945, + "Ġaz": 946, + "ĠÉ¡ËĪÊĮjaËIJ": 947, + "ze": 948, + "ĠÉĹËĮaËIJ": 949, + "ĠfËĪi": 950, + "ĠËĮÉĴn": 951, + "ĠxËĪo": 952, + "ĠËĮÊĬna": 953, + "ĠtʰaËIJ": 954, + "ĠsÉij": 955, + "ËĪeɪÊĥÉĻn": 956, + "ĠtÉķËĪiÉľ": 957, + "ĠÉŁaËIJ": 958, + "pËIJ": 959, + "Ġply": 960, + "θËĪi": 961, + "ËIJÉĸ": 962, + "ĠtËĪuei": 963, + "ĠlËĪÉĻ": 964, + "ĠdÉijËIJ": 965, + "ft": 966, + "ËĪam": 967, + "ĠsËĪÊĮkt": 968, + "ĠtËĪou": 969, + "ĠpËĪiÉĽ": 970, + "ĠËĪai": 971, + "ĠwËĪÉĴn": 972, + "ĠzËĮaɪn": 973, + "Ġest": 974, + "ĠmÉĶ": 975, + "ĠtÉķjËĪÉiju": 976, + "Éľp": 977, + "ËĪÊĮz": 978, + "bi": 979, + "ËĪÉĽËIJseËIJ": 980, + "ĠlËĪy": 981, + "ĠmËĮe": 982, + "ĠdËĮÉĽl": 983, + "ËĪiËIJl": 984, + "ĠkËĮomo": 985, + "ĠhËĪaÉľn": 986, + "ËĪoËIJne": 987, + "ĠkËĪÊĮɾt": 988, + "ĠsyÊģ": 989, + "ËĮÉĶɾ": 990, + "Ġɪf": 991, + "uv": 992, + "zÉĻn": 993, + "ol": 994, + "Ïĩ": 995, + "im": 996, + "ĠmËĪiÉĽ": 997, + "Ġðɪ": 998, + "ĠvËĪÉĽ": 999, + "ÊĬd": 1000, + "Ġtr": 1001, + "ËĪeËIJs": 1002, + "ðe": 1003, + "de": 1004, + "ʰÏĩ": 1005, + "ÉŁÊ°": 1006, + "ËĮÉĻËIJÉªÉľ": 1007, + "bËIJ": 1008, + "ËĪÊĬk": 1009, + "ĠnËĪÉĶÉªÉľ": 1010, + "ĠËĮiËIJ": 1011, + "ËĪÉijËIJt": 1012, + "ËĪiËIJɾ": 1013, + "Ġtɹ": 1014, + "ɾÉĶ": 1015, + "ĠwÉĴz": 1016, + "Ġvu": 1017, + "bÉĻl": 1018, + "bÉĻ": 1019, + "ɹi": 1020, + "nts": 1021, + "ĠsËĪaËIJ": 1022, + "dʰ": 1023, + "ĠtÊĬ": 1024, + "ĠÊİËĮi": 1025, + "βa": 1026, + "hËĪÉĻÉľÅĭ": 1027, + "ĠsËĪiËIJ": 1028, + "ĠpËĮaɾa": 1029, + "ËĪÉĽÉ¾ÉĶ": 1030, + "ËĪɪs": 1031, + "É£o": 1032, + "ĠËĮal": 
1033, + "or": 1034, + "ĠbËĪÊĮh": 1035, + "ĠkËĪoËIJ": 1036, + "ĠtËĪÉĽ": 1037, + "ĠpËĪo": 1038, + "ĠÊĴÉĻ": 1039, + "pÊģ": 1040, + "ĠËĪaɪ": 1041, + "hËĪÉijÉľÅĭ": 1042, + "ÉĻli": 1043, + "ËĪeɪt": 1044, + "ĠjËĪiouÉľ": 1045, + "ĠdËĪÉĻ": 1046, + "ĠmËĪÉĶËIJ": 1047, + "lËĪi": 1048, + "ËĮyÉĻ": 1049, + "ĠlËĪoËIJÉ¡": 1050, + "ĠnËĪÊĮ": 1051, + "ĠhËĪÊĬ": 1052, + "ĠnËĪÉĻÉľÅĭ": 1053, + "ĠÊģÉĻ": 1054, + "zËĪi": 1055, + "ĠtËĪuËIJ": 1056, + "ĠkËĮome": 1057, + "ĠlËĪeËIJ": 1058, + "ËĪaËIJtaËIJ": 1059, + "Ġan": 1060, + "ĠËĪyu": 1061, + "ĠËĮÊĮÉ¡ÉĻɾ": 1062, + "ĠËĪɪn": 1063, + "ĠhËĪoÉĻ": 1064, + "vÉĻ": 1065, + "ËĪøËIJ": 1066, + "θja": 1067, + "ËĪuÉĻÉľn": 1068, + "ĠkÉĻɾ": 1069, + "ËĪat": 1070, + "jËĪø": 1071, + "ËĪÉĽtÊģ": 1072, + "ĠpËĪÉiju": 1073, + "stÉĻ": 1074, + "ĠwÉĴt": 1075, + "ËĪeËIJl": 1076, + "ÊĪi": 1077, + "ĠxËĪaiÉľ": 1078, + "ËĪyÊģ": 1079, + "ĠhËĪoËIJÉ¡aËIJ": 1080, + "ĠtsËĪi": 1081, + "ĠËĪÊĮp": 1082, + "ĠnËĮÉĴt": 1083, + "ĠlËĪɪeËIJ": 1084, + "ĠhËĪa": 1085, + "Ġfl": 1086, + "ĠnËĪeËIJ": 1087, + "ËĮaËIJɪ": 1088, + "ĠtËĪuo": 1089, + "tÊĥËIJ": 1090, + "sËĪe": 1091, + "bʰi": 1092, + "ĠbËĪÊĮhÊĬt": 1093, + "ËĪÉĽnd": 1094, + "ĠsËĪÉĶ": 1095, + "ÉĻns": 1096, + "ËĮÉĻl": 1097, + "ÉĽÉľ": 1098, + "ĠÉ¡l": 1099, + "ËĪɪɾ": 1100, + "ËĪaËIJta": 1101, + "ÉľËIJ": 1102, + "ËĪÉĽnto": 1103, + "skËĮoËIJ": 1104, + "ËĪÉĽk": 1105, + "tsi": 1106, + "ĠtËĪonÉ¡": 1107, + "ĠbiËIJ": 1108, + "ĠhËĪaËIJɪ": 1109, + "ĠbËĪi": 1110, + "jj": 1111, + "Êİi": 1112, + "Ġkʰ": 1113, + "ĠsËĪo": 1114, + "llo": 1115, + "Ġbaɪ": 1116, + "ĠÉĽnt": 1117, + "ĠËĪiËIJ": 1118, + "ĠÉ¡ËĪo": 1119, + "ɾeËIJ": 1120, + "ĠkÊĭ": 1121, + "ĠmËĪeiÉľ": 1122, + "ÊĬËĪÉĶËIJ": 1123, + "ĠtËĪaɪ": 1124, + "Ġsus": 1125, + "Ġri": 1126, + "ĠvËĮÉĽ": 1127, + "ËĪiËIJno": 1128, + "vano": 1129, + "ĠdËĮiËIJ": 1130, + "ĠÊIJËĪaÉľn": 1131, + "ÊĤ": 1132, + "ĠÉIJb": 1133, + "ËĪaËIJh": 1134, + "ɪÊĥ": 1135, + "ĠdËĮella": 1136, + "tËIJi": 1137, + "ĠËĪÊĬn": 1138, + "ĠhiËIJ": 1139, + "ĠbËĪaËIJt": 1140, + "ĠthËĪi": 1141, + "Ġam": 1142, + "ĠËĪoËIJ": 1143, + "Ġhu": 1144, + 
"ĠkËĪÊĮh": 1145, + "ĠzËĪÉijËIJ": 1146, + "ĠÉ¡ËĮÉĶ": 1147, + "ĠËĪÉĻÊĬ": 1148, + "yËĪi": 1149, + "ĠlËĪÊĮ": 1150, + "ĠdËĪeËIJ": 1151, + "ĠsËĪÉĶËIJ": 1152, + "skËĮeËIJ": 1153, + "ɾo": 1154, + "ÊģËĪÉij": 1155, + "tËĪa": 1156, + "ĠkËĪÊĬ": 1157, + "ËĪante": 1158, + "ĠdÉĶ": 1159, + "ĠsËĪeɪ": 1160, + "ĠsÉĽt": 1161, + "ɹɪ": 1162, + "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, + "zo": 1164, + "ĠjËĪaËIJ": 1165, + "ĠÉĴvðÉĻ": 1166, + "ĠÊĿ": 1167, + "ĠÉĽl": 1168, + "ĠsËĪoËIJ": 1169, + "ĠthËĪiÉľ": 1170, + "ĠËĪÉĽl": 1171, + "ĠlyËĮi": 1172, + "ndÊĴ": 1173, + "ĠÉķjËĪÉiju": 1174, + "θa": 1175, + "ĠɾËĮÉĻheËIJ": 1176, + "Ġmaɪ": 1177, + "jÉĻ": 1178, + "ĠËĪÊĮb": 1179, + "asjËĪÉĶ": 1180, + "dÊģ": 1181, + "ĠkhËĪa": 1182, + "ĠËĪes": 1183, + "vi": 1184, + "fi": 1185, + "ËĮÉĻb": 1186, + "Ġre": 1187, + "ĠavËĮÉĽ": 1188, + "ĠtËĮi": 1189, + "Ġkɾ": 1190, + "Ġbɪk": 1191, + "ste": 1192, + "ËĪeËIJÊĥc": 1193, + "pt": 1194, + "zÉĻ": 1195, + "ĠwËĪaËIJ": 1196, + "kl": 1197, + "ĠsËĪÊĮm": 1198, + "ɪÊĪ": 1199, + "dz": 1200, + "vo": 1201, + "ËĮaÊĬt": 1202, + "nde": 1203, + "ĠdÉĽs": 1204, + "ĠÉŁËĪaËIJ": 1205, + "ĠrËĮi": 1206, + "sËĮeËIJ": 1207, + "É¡i": 1208, + "Ġals": 1209, + "ËĪiðo": 1210, + "ĠnËĪiÉľn": 1211, + "ÊĬl": 1212, + "tsËIJ": 1213, + "ËĪanto": 1214, + "ĠÉĹËĪÉĻÊĬ": 1215, + "kËIJi": 1216, + "ĠsËĪÊĮb": 1217, + "ĠnËĪa": 1218, + "ĠlËĮo": 1219, + "ĠphËĪi": 1220, + "mËĮe": 1221, + "Ġfa": 1222, + "kÉĻ": 1223, + "ĠzËĪu": 1224, + "ns": 1225, + "ĠÊģe": 1226, + "ĠbËĪo": 1227, + "ËĪaËIJti": 1228, + "Ġman": 1229, + "ĠlËĪiÉij": 1230, + "ĠÉĹËĮyÉĻ": 1231, + "ĠfËĪÉĶËIJ": 1232, + "ĠkÊĭËĪeËIJÊĥc": 1233, + "ĠxËĪÉij": 1234, + "ĠtÉķËĪu": 1235, + "jÉĻɾ": 1236, + "Ġɪst": 1237, + "wËĪi": 1238, + "ĠËĮaɪnÉĻ": 1239, + "ɪɡ": 1240, + "ĠsÊĪ": 1241, + "ËĪiÉĻl": 1242, + "ĠnËĪiÉĽÉľn": 1243, + "ĠËĮÉĽËIJ": 1244, + "ËĪaɪnd": 1245, + "ĠzËĪi": 1246, + "vÉĻn": 1247, + "mz": 1248, + "ðos": 1249, + "dÊĴËIJ": 1250, + "jËĪa": 1251, + "ɾËĪÉĶ": 1252, + "lËĪe": 1253, + "ʲ": 1254, + "ĠvËĪÉĶ": 1255, + "ĠlËĪiÉĽ": 1256, + "θe": 1257, + "mËĪente": 1258, + "ĠɪnðÉĻ": 1259, + 
"Ġaɪm": 1260, + "nÉĻn": 1261, + "ĠhÉĻm": 1262, + "ɾaËIJ": 1263, + "ĠsËĪuoÉľ": 1264, + "ĠɲËĪi": 1265, + "ĠɹËĪiÉĻl": 1266, + "lËĪa": 1267, + "ĠbËĪÉĶ": 1268, + "ĠkËĪai": 1269, + "ÊģËĪa": 1270, + "ĠwËĪÉľËIJ": 1271, + "ĠaËIJ": 1272, + "Ġpas": 1273, + "ËĪÊĮs": 1274, + "wËĪÉĽÉ¾": 1275, + "ĠÉĹËĪe": 1276, + "ĠhËĮatÉĻ": 1277, + "aɪn": 1278, + "ĠËĪÉĶpʰ": 1279, + "ÊģËĪe": 1280, + "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, + "ĠËĪÊĬs": 1282, + "ĠtÉķhËĪiÉľ": 1283, + "ntÊĥ": 1284, + "ĠxËĪuo": 1285, + "ËĪuÊģ": 1286, + "Ġɪm": 1287, + "ɳÉĸ": 1288, + "ËĪyÉĻÉľkh": 1289, + "ĠËĪyÉĽ": 1290, + "ĠmËĮaËIJ": 1291, + "ÅĵÊģ": 1292, + "ĠËĪalt": 1293, + "ĠkÉĻm": 1294, + "Êİo": 1295, + "ĠÉIJn": 1296, + "Ġfy": 1297, + "ĠËĮÉĽra": 1298, + "ĠÉ¡ËĪÊĬ": 1299, + "ĠpËĪÊĮ": 1300, + "ls": 1301, + "ĠlËĪiËIJ": 1302, + "ĠÊĤËĪy": 1303, + "ĠbɪkËĪÊĮz": 1304, + "ĠÉ¡ÉĽt": 1305, + "Ġbɾ": 1306, + "tʰ": 1307, + "tÉĻlËĮÉĻb": 1308, + "xo": 1309, + "skËĮaËIJ": 1310, + "ɲʲ": 1311, + "ËĪeËIJkÊĪ": 1312, + "rÉĻ": 1313, + "tÊĥo": 1314, + "ĠpÊģÉĶ": 1315, + "ĠɹËĪaɪt": 1316, + "ĠpËĪei": 1317, + "ËĮɪç": 1318, + "jËĪÉĽÉ¾": 1319, + "tËIJa": 1320, + "ĠÉIJbËĮaÊĬt": 1321, + "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, + "ĠvËĪe": 1323, + "ÊĬÉľ": 1324, + "ĠakËĪe": 1325, + "ĠpËĪai": 1326, + "vËĪÉĽ": 1327, + "Ġθɹ": 1328, + "ɪf": 1329, + "ĠavËĪÉĽ": 1330, + "ĠkËĪe": 1331, + "dËĪi": 1332, + "ËĪeËIJÉĸ": 1333, + "ĠbÉĻt": 1334, + "ÊĪʰ": 1335, + "teËIJ": 1336, + "θjËĪÉĶn": 1337, + "dÉľ": 1338, + "ĠjËĪiÉľ": 1339, + "Ġve": 1340, + "É£ËĪu": 1341, + "ËĪÊĮhÉĻl": 1342, + "ĠpÉĶ": 1343, + "ĠÉ¡r": 1344, + "Ġða": 1345, + "ĠvËĪiËIJ": 1346, + "ĠËĮÉijËIJ": 1347, + "ËĪÉĻÊĬnt": 1348, + "ĠbËĪaËIJɾ": 1349, + "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, + "ld": 1351, + "ĠtÉķËĮÉĶ": 1352, + "pa": 1353, + "ðËĪad": 1354, + "ËĪiɾ": 1355, + "ĠxËĪu": 1356, + "ĠlËĪiÉľÅĭ": 1357, + "ËĪeɪs": 1358, + "ĠÉĹËĮeÉľn": 1359, + "ĠthËĪiÉĽ": 1360, + "tËIJe": 1361, + "ĠavËĮÉĽk": 1362, + "ĠËĮÉĶ": 1363, + "ĠkËĪÉiju": 1364, + "ɪv": 1365, + "iËIJz": 1366, + "ËĪos": 1367, + "Ġɡɹ": 1368, + "and": 1369, + "ĠlËĪiou": 1370, + "ĠËĪoÉľ": 1371, + 
"É¡l": 1372, + "ĠpËĪÉĶËIJ": 1373, + "ĠmËĮeËIJ": 1374, + "ĠkËĪÉĴ": 1375, + "nos": 1376, + "çÉĻn": 1377, + "fÉĻn": 1378, + "ĠsËĪÊĮktËĮeËIJ": 1379, + "ĠËĪaɪn": 1380, + "ËĪoËIJre": 1381, + "jËĪÉĽn": 1382, + "ĠðËĪÉĽn": 1383, + "ĠtÉķhËĪiÉĽÉľn": 1384, + "ĠhËĪaɪ": 1385, + "ɾËĪÉĽ": 1386, + "ĠsËĪu": 1387, + "ĠkËĪɪjaËIJ": 1388, + "ĠpjËĮÊĬ": 1389, + "ĠhÉĻmËĮaËIJ": 1390, + "ĠËĮÊĮp": 1391, + "ĠpËĪÊĮhÉĻl": 1392, + "ĠxËĪÉĻ": 1393, + "dËĪe": 1394, + "ĠmÉij": 1395, + "ĠÊĬm": 1396, + "ndÉĻ": 1397, + "ĠdËĪÉĻÊĬnt": 1398, + "ËĪeËIJÊĥÉĻn": 1399, + "Ġðats": 1400, + "is": 1401, + "ĠcËĪaËIJh": 1402, + "pe": 1403, + "ĠsËĮo": 1404, + "ĠðËĪe": 1405, + "ĠsËĪaËIJt": 1406, + "ËĪaÊģ": 1407, + "ĠsËĪe": 1408, + "ÉĻk": 1409, + "ɪÊĭ": 1410, + "ĠkËĪoËIJi": 1411, + "kÉĶ": 1412, + "ĠvËĪaËIJÊĬ": 1413, + "ĠfËĪei": 1414, + "ĠlËĪeËIJk": 1415, + "ĠhËĪiÉĻ": 1416, + "ĠaÊĬ": 1417, + "ËĪÉĽndo": 1418, + "ËĪes": 1419, + "ĠzËĪÉĶ": 1420, + "ĠËĪÉĽÉ¾a": 1421, + "nËĪiÉľn": 1422, + "ĠkËĪÊĮm": 1423, + "ĠlËĪÉĴ": 1424, + "ɪst": 1425, + "ĠpÉij": 1426, + "ĠfËĪÉĶ": 1427, + "ĠthËĪonÉ¡": 1428, + "nke": 1429, + "ËĮɪk": 1430, + "ĠɲËĪÉĻ": 1431, + "ËĮÊĮm": 1432, + "ËĪiËIJt": 1433, + "ĠwËĪÉĴnt": 1434, + "ËĪaβan": 1435, + "ĠbËĪÊĮr": 1436, + "ÉĽnd": 1437, + "ĠËĮÉijËIJbÉľ": 1438, + "ĠvËĪaɪ": 1439, + "ĠtÊĥËĮi": 1440, + "ĠθËĪɪÅĭk": 1441, + "sti": 1442, + "Ġkɹ": 1443, + "ĠËĪaÊĬt": 1444, + "stÉĻn": 1445, + "ĠÊĭËĪÊĮn": 1446, + "ĠÉ¡ËĮaËIJ": 1447, + "ËĪaËIJÉľÉ²": 1448, + "Êģi": 1449, + "ĠnËĪÉĶx": 1450, + "ĠɹËĪiÉĻlɪ": 1451, + "ĠvËĮi": 1452, + "ĠðeÉĻ": 1453, + "ËĮɪtÊĥ": 1454, + "ĠvËĪyÉĻ": 1455, + "ĠËĮaËIJpkËĮaËIJ": 1456, + "ĠfËĮaËIJɪ": 1457, + "ĠpËĪÉĶ": 1458, + "ĠnËĪÊĮmb": 1459, + "θes": 1460, + "jËĪÉĽÊģ": 1461, + "ĠkËĪÊĬcʰ": 1462, + "mËĪÉĽ": 1463, + "ĠvËĪu": 1464, + "ĠlÅĵÊģ": 1465, + "ĠiËIJm": 1466, + "ÊĪÉĻɾ": 1467, + "tÊĥi": 1468, + "ËIJs": 1469, + "ĠtËĪy": 1470, + "ĠmËĪiÉľÅĭ": 1471, + "ɾËĪe": 1472, + "mËĮa": 1473, + "ĠmËĮiËIJ": 1474, + "ĠÉĽks": 1475, + "ɪp": 1476, + "ĠkËĪÊĮɾnËĮaËIJ": 1477, + "ĠËĮaÊĬx": 1478, + "rËĪiËIJ": 1479, + "ĠcËĪÊĮl": 
1480, + "mos": 1481, + "ĠkËĪÊĮɾtËĮeËIJ": 1482, + "iËIJɾ": 1483, + "kÉĻn": 1484, + "ĠdËĪu": 1485, + "naËIJ": 1486, + "ĠpwËĪe": 1487, + "ËĮÉĶɪ": 1488, + "ĠtÉķhËĪiÉĽ": 1489, + "ĠβËĪi": 1490, + "ËĪiÉĽÉľt": 1491, + "Ġte": 1492, + "ËĪaðos": 1493, + "mËĪa": 1494, + "ĠvËĪo": 1495, + "ĠmËĪɪ": 1496, + "ĠbËĮi": 1497, + "ad": 1498, + "do": 1499, + "ĠnËĪaÊĬ": 1500, + "ĠʲËĪyÉľ": 1501, + "wËĪÉĽ": 1502, + "ËĪis": 1503, + "el": 1504, + "Ġpar": 1505, + "ĠtËĪai": 1506, + "ĠdËĪɪjaËIJ": 1507, + "hËĪi": 1508, + "ĠɾËĪÊĮ": 1509, + "ĠdËĪe": 1510, + "ËĪaɪd": 1511, + "Ġper": 1512, + "ĠsËĮÉĶ": 1513, + "we": 1514, + "ÊĬm": 1515, + "Ġin": 1516, + "ĠjËĪuËIJz": 1517, + "ËĪiËIJpÉĻl": 1518, + "ĠÊĭËĪaËIJl": 1519, + "ĠetËĪÉĽ": 1520, + "ËĮÉĽm": 1521, + "ĠnËĪu": 1522, + "ËĪÉĽkt": 1523, + "ĠiËIJɾ": 1524, + "Ġbɹ": 1525, + "ĠtshËĪi": 1526, + "ĠÉĹËĪÉĶÉľ": 1527, + "ĠkwËĮa": 1528, + "ĠfËĪuÉľ": 1529, + "wËĮa": 1530, + "ĠdËĪiËIJ": 1531, + "ĠÉ¡ËĪyÉĻ": 1532, + "ËĮÉĽËIJ": 1533, + "rËĪa": 1534, + "Ġne": 1535, + "ĠzËĪyÉĻ": 1536, + "ĠbËĪaɪ": 1537, + "ĠÉŁËĪÊĮb": 1538, + "ËĪuËIJto": 1539, + "ÊĬnt": 1540, + "Ġcʰ": 1541, + "ËĪÉĽnti": 1542, + "ËĪoÉĻ": 1543, + "ĠsËĮÊĮm": 1544, + "ĠlÉij": 1545, + "ËĮeva": 1546, + "É¾ÉĽ": 1547, + "ntÉľ": 1548, + "ĠmËĪÉĽn": 1549, + "ËĪÉijËIJk": 1550, + "Ġkil": 1551, + "ËĪones": 1552, + "ff": 1553, + "ĠmËĪÉĽËIJ": 1554, + "ĠvËĪÉĻɪ": 1555, + "ĠËĪÉĶËIJ": 1556, + "ĠËĮɪnt": 1557, + "ÊĬn": 1558, + "Ġwɪl": 1559, + "Ġsin": 1560, + "ĠËĮalla": 1561, + "ĠaβËĪia": 1562, + "pi": 1563, + "ËĪoÉľ": 1564, + "ɪjËĮaËIJ": 1565, + "ku": 1566, + "ĠvËĪɪ": 1567, + "Ġtut": 1568, + "ĠtËĪeÉľ": 1569, + "ĠhËĪÉĶ": 1570, + "βɾe": 1571, + "sÉĻɾ": 1572, + "ĠkhËĪai": 1573, + "ĠmËĪÉĶ": 1574, + "Ġta": 1575, + "ĠɲËĪaËIJ": 1576, + "Ġnu": 1577, + "ËĪuËIJn": 1578, + "ĠÉĻËIJÉľ": 1579, + "ĠËĪaÊĬf": 1580, + "ËĪiËIJdÉľ": 1581, + "nti": 1582, + "ĠpËĪiËIJpÉĻl": 1583, + "Ġkj": 1584, + "Ġpe": 1585, + "ĠmËĪÉij": 1586, + "ËĮaɪ": 1587, + "ËĪaËIJle": 1588, + "ĠvËĮÉĻËIJÉªÉľ": 1589, + "mpo": 1590, + "ĠkËĪɪt": 1591, + "ĠnËĮÉĽ": 1592, + 
"ĠÉŁËĪaËIJtaËIJ": 1593, + "ĠsËĪaËIJtʰ": 1594, + "ĠÉŁËĪi": 1595, + "Ġso": 1596, + "ĠbËĪÉĽ": 1597, + "kËĪi": 1598, + "ɪti": 1599, + "Ġtsi": 1600, + "ĠkÊģ": 1601, + "ËĮÉĴ": 1602, + "É¡ÉĻl": 1603, + "kst": 1604, + "ĠmËĪÉĻËIJ": 1605, + "ËĪÊĮk": 1606, + "ĠnËĪaËIJÊĬ": 1607, + "Ġap": 1608, + "ĠlËĪɪkʰ": 1609, + "lli": 1610, + "ĠkwËĪal": 1611, + "ĠËĪÉĻËIJ": 1612, + "ĠtsËĪuei": 1613, + "Ġdo": 1614, + "ĠkËIJjËĪo": 1615, + "ÊĬz": 1616, + "ĠpËĪaËIJ": 1617, + "ĠmËĪuËIJ": 1618, + "ĠÉ¡ÉĻv": 1619, + "rËĪi": 1620, + "Ġtw": 1621, + "ËĮɪn": 1622, + "dËĪÉij": 1623, + "ĠðËĪi": 1624, + "ĠËĪaËIJi": 1625, + "ĠhËĪiÉĽ": 1626, + "ĠðËĮÉĽm": 1627, + "ĠpʰËĪɪɾ": 1628, + "ÉĴm": 1629, + "ĠËĮeËIJ": 1630, + "ĠthËĪaiÉľ": 1631, + "ĠvËĪas": 1632, + "ĠnÉijËIJ": 1633, + "pÉĻn": 1634, + "ĠpËĮÉĻɾ": 1635, + "ĠÉĹËĪaËIJɪ": 1636, + "ËĪouÉľ": 1637, + "ĠÊIJËĪuÉľ": 1638, + "ĠmËĪan": 1639, + "ĠtËĪÉĻÉªÉľ": 1640, + "ĠlËĪaËIJÊĬ": 1641, + "mËĪÉĽnte": 1642, + "ĠfËĪam": 1643, + "sjËĪÉĶ": 1644, + "ĠpËĪÉĻ": 1645, + "ËĪeËIJm": 1646, + "ĠpËĪÊĮr": 1647, + "jËĪi": 1648, + "ĠlÉĽ": 1649, + "Ġten": 1650, + "ËĪoËIJra": 1651, + "ki": 1652, + "ĠÊĤËĪaËIJÊĬ": 1653, + "kɪ": 1654, + "bËIJe": 1655, + "ËĪalt": 1656, + "ðɪ": 1657, + "pËĪi": 1658, + "ĠËĮÉĽnt": 1659, + "ĠmËĪei": 1660, + "ĠhËĪÉĻÊĬ": 1661, + "ĠhËĪÉĽÉ¾": 1662, + "jËĪÉij": 1663, + "ĠhËĪÊĬaËIJ": 1664, + "mÉľ": 1665, + "Ġdʰ": 1666, + "ĠtÊĥËĪe": 1667, + "lËĪÉĽ": 1668, + "ËĪaËIJte": 1669, + "ĠpËĪuËIJ": 1670, + "ĠmËĪÊĬ": 1671, + "ËĪaËIJɪÊĪ": 1672, + "diËIJ": 1673, + "ĠfɹÉĴm": 1674, + "ĠhËĪÉijËIJ": 1675, + "βo": 1676, + "ĠmËĪiÉľn": 1677, + "ĠðiËIJz": 1678, + "ĠkËĪou": 1679, + "ËĪiËIJna": 1680, + "ĠavËĮeva": 1681, + "ĠËĪaËIJɾ": 1682, + "ĠnËĪuËIJɾ": 1683, + "ĠβËĪe": 1684, + "Ġzaɪn": 1685, + "ËĪÉĽd": 1686, + "ÉĹ": 1687, + "ËĪeɪk": 1688, + "sËĮÉĻÊĬ": 1689, + "ËĪeËIJÉŁ": 1690, + "ĠÊĤËĪÉĻËIJ": 1691, + "je": 1692, + "cʰËIJ": 1693, + "ËĪÉĶr": 1694, + "ÉĽËIJ": 1695, + "ĠtÉķhËĪyÃ¦Éľn": 1696, + "ĠËĮaɪnÉĻn": 1697, + "ĠiËIJn": 1698, + "ĠbËĪÊĮc": 1699, + "ËĪiËIJm": 1700, + "ɾas": 1701, + "ËĮÉĻs": 
1702, + "ĠvËĪeËIJ": 1703, + "ĠËĪÉĻrÉľ": 1704, + "ĠduËIJ": 1705, + "ntÉĻ": 1706, + "ĠpɹËĪÉĴ": 1707, + "ĠbËĪɪ": 1708, + "ĠwËĪoÉľ": 1709, + "nËĮi": 1710, + "ĠhÉIJ": 1711, + "ĠkËĪÉĽ": 1712, + "Ġet": 1713, + "jËĪÉĽndo": 1714, + "ĠËĪaiÉľ": 1715, + "Ġli": 1716, + "ĠËĪaÊĬs": 1717, + "kËIJo": 1718, + "ĠÉĹËĪyÉĻ": 1719, + "keËIJ": 1720, + "ĠfËĪiËIJl": 1721, + "ĠbʰËĪaËIJi": 1722, + "ĠÉ¡ÉĻÊĥ": 1723, + "ÊĴËĪe": 1724, + "ĠnjËĪuËIJ": 1725, + "ĠËĪak": 1726, + "ĠÉĹËĪaËIJ": 1727, + "zËĪa": 1728, + "vËĪe": 1729, + "ĠhËĮaÊĬ": 1730, + "ÉIJç": 1731, + "ĠɾËĪÊĮkʰ": 1732, + "pËĪe": 1733, + "ĠtÉĻbi": 1734, + "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, + "ĠfËĪÉĽ": 1736, + "ĠwËĮɪtÊĥ": 1737, + "ĠtÉķËĪyÉĽÉľ": 1738, + "wËĮe": 1739, + "ËĮaɪt": 1740, + "ĠnÉijËIJx": 1741, + "ĠkËĪÉĶËIJn": 1742, + "ÊĬk": 1743, + "ĠbËĪaËIJd": 1744, + "ÅĭÉĻn": 1745, + "Ġni": 1746, + "ĠbËĪe": 1747, + "ĠmËĮÊĬ": 1748, + "ËĪar": 1749, + "ĠmËĮeɪk": 1750, + "ĠsËĪaËIJɾ": 1751, + "βe": 1752, + "ĠtÉķhËĪiÉľÅĭ": 1753, + "itËĪe": 1754, + "kËĮe": 1755, + "ËĪÉĽËIJl": 1756, + "ËĮÉĴn": 1757, + "ËĮÉij": 1758, + "ĠbËĪɪl": 1759, + "ĠwÊĬd": 1760, + "ĠbËĪoËIJl": 1761, + "rd": 1762, + "iÉĻ": 1763, + "Ġda": 1764, + "ĠbËĪaËIJÊĬ": 1765, + "ĠnËĪÊĮmbÉĻɾ": 1766, + "ËĪaËIJÉªÉľ": 1767, + "ĠÉĽm": 1768, + "ĠmiËIJɾ": 1769, + "ËĪeɪm": 1770, + "los": 1771, + "ËĮÉĽt": 1772, + "ĠËĮaÊĬs": 1773, + "ĠmËĪaÉľt": 1774, + "ĠwËĪuÉĻ": 1775, + "ĠwËĪeɪ": 1776, + "Ġseɲ": 1777, + "ĠbjËĪÉĽ": 1778, + "ĠwÉĽn": 1779, + "fl": 1780, + "ĠkhwËĪa": 1781, + "dËĪÉĽ": 1782, + "vɹɪ": 1783, + "ĠËĪaɾ": 1784, + "jËĪÉijuÉľ": 1785, + "ĠËĮaËIJpkËĮeËIJ": 1786, + "bÊģ": 1787, + "ĠtËĪaɪm": 1788, + "ĠËĪÉij": 1789, + "ĠsËĮa": 1790, + "ĠzËĪoɪ": 1791, + "ËĪÉĶɾa": 1792, + "ĠdËĪø": 1793, + "ËĪÉĶɾt": 1794, + "ĠÅĭËĪÉĶ": 1795, + "min": 1796, + "ĠlËĪÊĬk": 1797, + "ËĪÉĶËIJt": 1798, + "ĠËĪÉĶtɾ": 1799, + "ĠfËĪaɪ": 1800, + "ĠÉ¡ÉĴt": 1801, + "ËĪeËIJÉĻn": 1802, + "kËĪÉĶ": 1803, + "ĠvËĪÉĽÉ¹i": 1804, + "mÉĽ": 1805, + "ËĪaɪz": 1806, + "Ġesp": 1807, + "ɲa": 1808, + "ĠlËĪo": 1809, + "ËĪÉĽËIJra": 1810, + "βËĪi": 1811, + "ouÉľ": 
1812, + "ËĮÉĻk": 1813, + "tÊĥuËIJ": 1814, + "ĠnËĪyÉĻ": 1815, + "ÊĪɾ": 1816, + "ĠÉ¡ËĪy": 1817, + "ĠtËĪoðo": 1818, + "ËĪɪçt": 1819, + "Ġmɪç": 1820, + "ĠËĪand": 1821, + "ĠkwËĮÉĽl": 1822, + "ĠÊĤËĪaËIJ": 1823, + "ĠnËĪiÉľ": 1824, + "ËĪÉĶp": 1825, + "ËĪiËIJz": 1826, + "ĠÊĤËĪaÊĬ": 1827, + "ĠɾËĮÉĻhi": 1828, + "ĠsËĮÊĬo": 1829, + "ĠÉĽÉ¡": 1830, + "ĠdÅĵ": 1831, + "ĠÉ¡ËĮaËIJÉªÉľ": 1832, + "dɪ": 1833, + "lËĮa": 1834, + "stËĪi": 1835, + "ĠdËĮiËIJz": 1836, + "ĠtËĮÊĬ": 1837, + "θi": 1838, + "ĠËĪɪskËĮoËIJ": 1839, + "ndÉĻn": 1840, + "Ġtsv": 1841, + "ĠhËĪÉĻËIJ": 1842, + "ĠÊĥËĪÊĬ": 1843, + "ÉĻtËĮeËIJ": 1844, + "pËĮÉĽ": 1845, + "ËĪaɾÉĶn": 1846, + "ĠpÉĽÊģ": 1847, + "Ġy": 1848, + "mnËĮeËIJ": 1849, + "ËĪÉĽllo": 1850, + "ĠÉ¡ËĪÉĻ": 1851, + "ĠËĮad": 1852, + "ĠÊĥv": 1853, + "ËĪÊıɾ": 1854, + "rËĪe": 1855, + "yËIJ": 1856, + "ĠpËĪaËIJs": 1857, + "ĠËĪÉĽn": 1858, + "ɪdÊĴ": 1859, + "ËĪuai": 1860, + "Ġfi": 1861, + "ĠtËĪyÉĻ": 1862, + "ËĪaËIJÉŁ": 1863, + "ĠtjËĪe": 1864, + "ËĪaËIJnaËIJ": 1865, + "stɾ": 1866, + "Êİe": 1867, + "ËĮeɪt": 1868, + "ba": 1869, + "ðas": 1870, + "vÊģ": 1871, + "ĠzËĪÉĻËIJ": 1872, + "ËĪaËIJli": 1873, + "ÉŁÊ°eËIJ": 1874, + "ËĪaËIJteËIJ": 1875, + "ĠvËĪa": 1876, + "Ġsal": 1877, + "ËĪaËIJno": 1878, + "ĠÉ¡ÉĻz": 1879, + "ĠhËĪoËIJti": 1880, + "ĠɲËĪiÉĽ": 1881, + "tÉľ": 1882, + "ĠËĪaËIJp": 1883, + "ĠwËĪÉĽl": 1884, + "ĠmËĪɪl": 1885, + "ĠfyËIJɾ": 1886, + "ËĪÉĽËIJsaËIJ": 1887, + "ĠbËĮiËIJ": 1888, + "ËĪaËIJjaËIJ": 1889, + "ËĪɪp": 1890, + "ĠfÊģ": 1891, + "tsiËĪoËIJne": 1892, + "ĠwËĪuÉľ": 1893, + "Ġvi": 1894, + "ĠwËĪÉijÉľn": 1895, + "ËĪoËIJn": 1896, + "ĠÉĹËĪÉĻɪ": 1897, + "ĠÊĿËĪo": 1898, + "Ġra": 1899, + "mÉĻnt": 1900, + "ËĪaÊĬnd": 1901, + "ĠpÉĽÉ¾": 1902, + "ĠÉĹËĪaËIJÊĬ": 1903, + "oËIJɾ": 1904, + "hËĪo": 1905, + "ĠÉĴn": 1906, + "ĠÊİe": 1907, + "ĠsËĪɪks": 1908, + "É¡n": 1909, + "ĠÉ¡ËĪa": 1910, + "Ġθj": 1911, + "ĠpËĪe": 1912, + "spe": 1913, + "ĠvËĪÉĻ": 1914, + "ĠfËĪɪ": 1915, + "ĠËĮɪntÊĬ": 1916, + "lÉĻn": 1917, + "ĠnËĪiËIJd": 1918, + "ĠsËĮÊĬa": 1919, + "ĠËĪum": 1920, + "ĠdËĪeɪ": 1921, + "ĠËĪÊĮbʰi": 
1922, + "ËĪÉijËIJɾ": 1923, + "ĠbËĪiÉĽÉľt": 1924, + "Êİos": 1925, + "ĠtshËĪaiÉľ": 1926, + "ĠËĮɪskËĮaËIJ": 1927, + "ĠaÊĬÉĻ": 1928, + "ĠËĪyæ": 1929, + "Ġdyn": 1930, + "ĠmËĪiËIJn": 1931, + "ĠËĪÊĮcʰËIJ": 1932, + "ĠsÉĽ": 1933, + "ĠnËĪy": 1934, + "ĠnËĮÉĽl": 1935, + "ɡɾ": 1936, + "ÊĥËĪe": 1937, + "ĠÊĤËĮÉĽ": 1938, + "ĠËĪÉĽvɹɪ": 1939, + "ËĪÉĽlp": 1940, + "ĠbËĪak": 1941, + "ĠeËIJ": 1942, + "ĠfËĪaËIJ": 1943, + "ĠkÉĽl": 1944, + "ĠËĪeËIJs": 1945, + "jËĪaËIJd": 1946, + "ĠlËĮi": 1947, + "mbɾe": 1948, + "ktÉĻ": 1949, + "nta": 1950, + "tËĪu": 1951, + "ĠðËĪat": 1952, + "ĠËĪaβ": 1953, + "ÉĻɹi": 1954, + "ĠkwËĮÉĽlla": 1955, + "ĠbÉĻn": 1956, + "rËĮÉĽ": 1957, + "ĠnÉĶ": 1958, + "ĠÉ¡ËĪɪ": 1959, + "ĠËĪap": 1960, + "ɹÉĻ": 1961, + "ËĪaÉľkh": 1962, + "ĠÊIJËĪi": 1963, + "ĠËĪÉijËIJ": 1964, + "ɪɡÉĻn": 1965, + "ĠwËĪai": 1966, + "ĠpÉĻt": 1967, + "kËIJa": 1968, + "ĠbËĪÉĽËIJ": 1969, + "ËĪeËIJÊĭ": 1970, + "lsÉĻÊĬ": 1971, + "ĠcËĪaËIJhɪËĮeËIJ": 1972, + "ĠkÉĻn": 1973, + "ĠËĮaɪnÉĻm": 1974, + "ËĪuËIJt": 1975, + "ĠhËĪaÊĬ": 1976, + "ĠtËĪanto": 1977, + "ĠhÉIJz": 1978, + "ĠsËĪÊĮɾ": 1979, + "Ġno": 1980, + "ĠtËĪÉĶËIJ": 1981, + "ĠzËĪaɪ": 1982, + "ĠtÉķËĪiÉĽÉľ": 1983, + "ĠkozËĪi": 1984, + "ĠkËĪei": 1985, + "ðËĪÉĶɾ": 1986, + "ËĮÉĶÊģ": 1987, + "ĠtËĪÊĮɾ": 1988, + "ĠÊIJËĪÉĻ": 1989, + "ĠÉķËĪyÉĽÉľ": 1990, + "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, + "mf": 1992, + "ĠvËĪiËIJdÉľ": 1993, + "kËĪa": 1994, + "ĠÉIJÉ¡": 1995, + "kw": 1996, + "ĠÊģÉĽ": 1997, + "xÉĻn": 1998, + "ĠdÊĬ": 1999, + "ĠkËĪÊĮɾnËĮeËIJ": 2000, + "jËĪaËIJdaËIJ": 2001, + "ĠfÉĻ": 2002, + "ĠËĮimp": 2003, + "Ġhɪz": 2004, + "ĠʰÏĩ": 2005, + "ËĪoËIJni": 2006, + "ĠxËĪiÉľ": 2007, + "ËĪeËIJsÊĪ": 2008, + "ÊıbÉľ": 2009, + "ËĮÉĶɾke": 2010, + "ĠÉ¡ËĪÉĻÊĬ": 2011, + "ËĪɪÊĥÉĻn": 2012, + "les": 2013, + "ĠfËĪiËIJ": 2014, + "É¡tÉĻ": 2015, + "ËĪeËIJre": 2016, + "ĠvËĮaËIJ": 2017, + "ĠËĪeɪ": 2018, + "ĠmËĪuÉĻÉľn": 2019, + "ĠÉ¡ËĪÊĬd": 2020, + "ĠmËĮaɪn": 2021, + "zËĪe": 2022, + "ĠlËĪiÉľ": 2023, + "Ġmu": 2024, + "ĠkËĮÉĽl": 2025, + "ĠjËĮÉĻh": 2026, + "ĠfËĮÉĶɾ": 2027, + "fɹ": 2028, + "ĠkËĪaɪn": 2029, + 
"ĠËĪÉĴlsÉĻÊĬ": 2030, + "θɪÅĭ": 2031, + "ĠthËĪonÉ¡Éľ": 2032, + "tËĪÉij": 2033, + "θjo": 2034, + "mËĪÉĶ": 2035, + "Ġos": 2036, + "ĠsÊĬ": 2037, + "ĠsËĪÊĮmÉĻ": 2038, + "ĠvËĮÉĽn": 2039, + "nËĪo": 2040, + "ĠËĪaktÊĥuËIJ": 2041, + "É£a": 2042, + "Ġtʰi": 2043, + "ĠfËĮi": 2044, + "ĠvËĪÉĽl": 2045, + "ĠtËĪutËIJi": 2046, + "xos": 2047 + }, + "merges": [ + [ + "Ë", + "Ī" + ], + [ + "Ë", + "IJ" + ], + [ + "ËĪ", + "É" + ], + [ + "Ë", + "Į" + ], + [ + "É", + "Ļ" + ], + [ + "ËĪ", + "a" + ], + [ + "ËĪ", + "i" + ], + [ + "Ġ", + "t" + ], + [ + "É", + "ª" + ], + [ + "É", + "¾" + ], + [ + "Ġ", + "É" + ], + [ + "Ġ", + "k" + ], + [ + "É", + "ľ" + ], + [ + "Ġ", + "s" + ], + [ + "ËĪ", + "e" + ], + [ + "É", + "Ľ" + ], + [ + "ËĪ", + "o" + ], + [ + "Ġ", + "l" + ], + [ + "ËĪÉ", + "Ľ" + ], + [ + "Ġ", + "d" + ], + [ + "Ê", + "Ĭ" + ], + [ + "ËĪa", + "ËIJ" + ], + [ + "Ġ", + "p" + ], + [ + "Ì", + "ĥ" + ], + [ + "Ġ", + "m" + ], + [ + "ËĪ", + "u" + ], + [ + "Å", + "ĭ" + ], + [ + "Ã", + "°" + ], + [ + "ËĪÉ", + "Ķ" + ], + [ + "Ê", + "Į" + ], + [ + "ËĮ", + "a" + ], + [ + "Ġ", + "h" + ], + [ + "ËĪ", + "ÊĮ" + ], + [ + "Ġ", + "n" + ], + [ + "Ê", + "ģ" + ], + [ + "ËĪÉ", + "ij" + ], + [ + "Ê", + "ĥ" + ], + [ + "e", + "ËIJ" + ], + [ + "Ġ", + "a" + ], + [ + "Ġ", + "b" + ], + [ + "É", + "Ķ" + ], + [ + "ËĪÉ", + "Ļ" + ], + [ + "ÉĻ", + "n" + ], + [ + "Ġ", + "f" + ], + [ + "ËĪÉ", + "ª" + ], + [ + "É", + "¡" + ], + [ + "ËĪe", + "ËIJ" + ], + [ + "Ġ", + "j" + ], + [ + "n", + "t" + ], + [ + "Ġ", + "ð" + ], + [ + "Ġ", + "ËĮ" + ], + [ + "Ġt", + "s" + ], + [ + "ĠÉ", + "¡" + ], + [ + "É", + "ķ" + ], + [ + "ËĪo", + "ËIJ" + ], + [ + "Ê", + "°" + ], + [ + "a", + "ËIJ" + ], + [ + "ËĪ", + "y" + ], + [ + "Ġt", + "Éķ" + ], + [ + "ËĪi", + "ËIJ" + ], + [ + "Ġ", + "Ê" + ], + [ + "Ġ", + "v" + ], + [ + "Ġ", + "w" + ], + [ + "s", + "t" + ], + [ + "É", + "ij" + ], + [ + "n", + "d" + ], + [ + "ËĮ", + "i" + ], + [ + "Ì", + "ª" + ], + [ + "ËĮ", + "e" + ], + [ + "Ġ", + "z" + ], + [ + "ËĪa", + "ɪ" + ], + [ + "ËĪi", + "ÉĽ" + ], + [ + "Î", + "²" 
+ ], + [ + "É", + "¹" + ], + [ + "Ġ", + "ËĮa" + ], + [ + "Î", + "¸" + ], + [ + "Ġh", + "ÉĽ" + ], + [ + "Ê", + "Ī" + ], + [ + "i", + "ËIJ" + ], + [ + "ËĮ", + "o" + ], + [ + "Ġ", + "ɪ" + ], + [ + "Éľ", + "n" + ], + [ + "Ġ", + "x" + ], + [ + "Ġt", + "ÉĻ" + ], + [ + "ËĪu", + "ËIJ" + ], + [ + "ËĮ", + "ÉĻ" + ], + [ + "Ġj", + "ËĪi" + ], + [ + "ËĮ", + "ÉĽ" + ], + [ + "ĠÉ", + "Ľ" + ], + [ + "Ġ", + "ËĪa" + ], + [ + "ËĮa", + "ËIJ" + ], + [ + "Ġl", + "a" + ], + [ + "Ġð", + "e" + ], + [ + "ĠhÉĽ", + "ËIJ" + ], + [ + "Ġ", + "e" + ], + [ + "Ã", + "§" + ], + [ + "ÉĻ", + "l" + ], + [ + "o", + "ËIJ" + ], + [ + "ËĪÉij", + "u" + ], + [ + "Ê", + "Ĵ" + ], + [ + "u", + "ËIJ" + ], + [ + "ĠÉ", + "Ĺ" + ], + [ + "ĠÉ", + "ķ" + ], + [ + "ËĮ", + "eËIJ" + ], + [ + "ĠtÉķ", + "ËĪi" + ], + [ + "o", + "s" + ], + [ + "ËĪÉĶ", + "ËIJ" + ], + [ + "a", + "s" + ], + [ + "ËĪ", + "ÊĬ" + ], + [ + "Ġ", + "i" + ], + [ + "ËĪa", + "i" + ], + [ + "É", + "²" + ], + [ + "ɪ", + "n" + ], + [ + "t", + "s" + ], + [ + "Éľ", + "Åĭ" + ], + [ + "ĠÉ", + "Ł" + ], + [ + "Ġ", + "Êĥ" + ], + [ + "ËĪe", + "ɪ" + ], + [ + "ÉĽ", + "ɾ" + ], + [ + "ËĪÉĽ", + "ËIJ" + ], + [ + "ËĪÉĽ", + "ɾ" + ], + [ + "Ġ", + "r" + ], + [ + "t", + "Êĥ" + ], + [ + "ËĮ", + "ÉĶ" + ], + [ + "Ġd", + "ÉĻ" + ], + [ + "t", + "ÉĻ" + ], + [ + "o", + "u" + ], + [ + "ËĪy", + "ÉĻ" + ], + [ + "ĠËĮ", + "i" + ], + [ + "ÉĻ", + "ɾ" + ], + [ + "ËĪÉĻ", + "ÊĬ" + ], + [ + "ËĪÊĮ", + "ɾ" + ], + [ + "ËĪÉ", + "Ĵ" + ], + [ + "Ġt", + "h" + ], + [ + "ËĪo", + "n" + ], + [ + "Ê", + "ĭ" + ], + [ + "ËĪÉij", + "ËIJ" + ], + [ + "ËĪÊĮ", + "h" + ], + [ + "w", + "ËĪa" + ], + [ + "ËĪe", + "i" + ], + [ + "l", + "l" + ], + [ + "ĠÉ", + "IJ" + ], + [ + "Éij", + "ËIJ" + ], + [ + "a", + "n" + ], + [ + "É", + "Ł" + ], + [ + "ĠÊ", + "ĭ" + ], + [ + "Ġk", + "o" + ], + [ + "k", + "h" + ], + [ + "ɪ", + "Åĭ" + ], + [ + "ËĪaËIJ", + "ɪ" + ], + [ + "Ġt", + "Êĥ" + ], + [ + "ËĪaËIJ", + "t" + ], + [ + "ĠËĮ", + "e" + ], + [ + "ĠtÉķ", + "h" + ], + [ + "ËĪu", + "o" + ], + [ + "ËĪon", + "É¡" + ], + [ + "É", + "ĸ" + 
], + [ + "a", + "t" + ], + [ + "Ġk", + "e" + ], + [ + "É", + "Ĵ" + ], + [ + "ĠÉķ", + "ËĪi" + ], + [ + "Ã", + "¸" + ], + [ + "ĠÉ", + "ij" + ], + [ + "ËĪeËIJ", + "k" + ], + [ + "Å", + "ĵ" + ], + [ + "r", + "e" + ], + [ + "Ġ", + "ɾ" + ], + [ + "Ġk", + "ÉĶ" + ], + [ + "ËĮ", + "ÊĬ" + ], + [ + "s", + "k" + ], + [ + "Ġ", + "ÊĬ" + ], + [ + "Ġa", + "nd" + ], + [ + "ɪ", + "ç" + ], + [ + "Ġm", + "e" + ], + [ + "ËĪa", + "ɾ" + ], + [ + "Ġ", + "ËĪɪ" + ], + [ + "n", + "a" + ], + [ + "Ġ", + "β" + ], + [ + "Ġl", + "ËĪi" + ], + [ + "j", + "aËIJ" + ], + [ + "l", + "i" + ], + [ + "n", + "o" + ], + [ + "Ġɪ", + "n" + ], + [ + "Ġd", + "ËĮi" + ], + [ + "ĠÉ", + "²" + ], + [ + "t", + "ËIJ" + ], + [ + "ÉĻ", + "m" + ], + [ + "Ġl", + "ÉĻ" + ], + [ + "Ġð", + "ÉĻ" + ], + [ + "ɪ", + "k" + ], + [ + "ËĪÉĽ", + "l" + ], + [ + "Éľ", + "t" + ], + [ + "Ġs", + "e" + ], + [ + "e", + "s" + ], + [ + "ËĪo", + "u" + ], + [ + "ËĪa", + "ÊĬ" + ], + [ + "ĠÉ", + "Ķ" + ], + [ + "ɪ", + "t" + ], + [ + "Ġ", + "Åĭ" + ], + [ + "ËĪÉĽ", + "n" + ], + [ + "Ê", + "İ" + ], + [ + "Ġk", + "h" + ], + [ + "ËĪÉĽ", + "nt" + ], + [ + "ËĪaËIJ", + "ɾ" + ], + [ + "Ġk", + "i" + ], + [ + "m", + "p" + ], + [ + "l", + "t" + ], + [ + "É", + "£" + ], + [ + "Ġp", + "a" + ], + [ + "ËĪÉĻ", + "ËIJ" + ], + [ + "ɪ", + "s" + ], + [ + "ĠÉ", + "Ĵ" + ], + [ + "Ġl", + "e" + ], + [ + "ɪ", + "Éľ" + ], + [ + "ËĪÉĽ", + "t" + ], + [ + "Ġd", + "e" + ], + [ + "ĠÉ", + "¹" + ], + [ + "Ġt", + "ËĪoËIJ" + ], + [ + "Ġ", + "Êģ" + ], + [ + "Êĥ", + "ÉĻn" + ], + [ + "ĠÊĬ", + "nt" + ], + [ + "ËĪÉĶ", + "ɾ" + ], + [ + "ËĪa", + "ð" + ], + [ + "Ġa", + "ɪ" + ], + [ + "ĠÊ", + "IJ" + ], + [ + "Ġm", + "ËĪa" + ], + [ + "r", + "a" + ], + [ + "Ġk", + "ËĪɪ" + ], + [ + "k", + "t" + ], + [ + "ËIJ", + "p" + ], + [ + "ĠÊ", + "Ī" + ], + [ + "ËĪaËIJ", + "ÊĬ" + ], + [ + "Ġk", + "ËĪÊĮɾ" + ], + [ + "Ġ", + "ËĪÊĮ" + ], + [ + "ĠÉĴ", + "v" + ], + [ + "Ġe", + "l" + ], + [ + "k", + "s" + ], + [ + "Ġk", + "w" + ], + [ + "ÉĻ", + "t" + ], + [ + "nd", + "o" + ], + [ + "e", + "i" + ], + [ + "ĠËĮa", + 
"ËIJp" + ], + [ + "s", + "e" + ], + [ + "ÉĻ", + "ɹ" + ], + [ + "ËĪu", + "ei" + ], + [ + "ÉĻ", + "s" + ], + [ + "Ġk", + "ËĮo" + ], + [ + "ĠÊ", + "Ĥ" + ], + [ + "ĠËĮ", + "ÊĬ" + ], + [ + "Ġ", + "c" + ], + [ + "ĠÉĽ", + "n" + ], + [ + "ËĪa", + "nt" + ], + [ + "θ", + "j" + ], + [ + "ËĮo", + "ËIJ" + ], + [ + "Ġ", + "ËĪaËIJ" + ], + [ + "Ġp", + "ɾ" + ], + [ + "s", + "i" + ], + [ + "Ġ", + "ËĪe" + ], + [ + "Ġj", + "uËIJ" + ], + [ + "Ġk", + "ËĮe" + ], + [ + "ËĮ", + "ɪ" + ], + [ + "ÉĶ", + "n" + ], + [ + "Ġs", + "ËĪÊĮ" + ], + [ + "Ġ", + "ËĪu" + ], + [ + "n", + "i" + ], + [ + "Ġs", + "t" + ], + [ + "Ġd", + "iËIJ" + ], + [ + "Ġk", + "eËIJ" + ], + [ + "ĠjËĪi", + "ou" + ], + [ + "ËĪai", + "Éľ" + ], + [ + "Ġd", + "ÊĴ" + ], + [ + "Ġ", + "ËĪÉĶ" + ], + [ + "v", + "a" + ], + [ + "ËIJ", + "ɾ" + ], + [ + "ËĪ", + "ø" + ], + [ + "ËĮÉĻ", + "ÊĬ" + ], + [ + "Ġp", + "ËĪu" + ], + [ + "Ġs", + "u" + ], + [ + "Ġm", + "a" + ], + [ + "Ġ", + "ÉĻ" + ], + [ + "d", + "ÊĴ" + ], + [ + "Ġp", + "ʰ" + ], + [ + "l", + "e" + ], + [ + "i", + "n" + ], + [ + "ĠtÉķh", + "ËĪi" + ], + [ + "Ġw", + "ËĪo" + ], + [ + "r", + "o" + ], + [ + "ËĮ", + "y" + ], + [ + "ɾ", + "a" + ], + [ + "Ġs", + "ËĪi" + ], + [ + "ð", + "ÉĻ" + ], + [ + "Ġs", + "eËIJ" + ], + [ + "l", + "a" + ], + [ + "ĠÊ", + "Ĵ" + ], + [ + "m", + "b" + ], + [ + "Ġh", + "ËĪoËIJ" + ], + [ + "Ġb", + "ʰ" + ], + [ + "ĠÉĽ", + "ɾ" + ], + [ + "Ġð", + "at" + ], + [ + "s", + "p" + ], + [ + "ÉĶ", + "ɾ" + ], + [ + "e", + "n" + ], + [ + "Ġs", + "ÉĻ" + ], + [ + "ËĪÉĶ", + "Éľ" + ], + [ + "Ġl", + "ËĮa" + ], + [ + "ĠËĮ", + "ÉĽ" + ], + [ + "Ġ", + "ËĪy" + ], + [ + "É¡", + "aËIJ" + ], + [ + "Ġd", + "ÉĽÉ¾" + ], + [ + "ËĪÉĽ", + "Êģ" + ], + [ + "Éľ", + "kh" + ], + [ + "ËĪi", + "ÉĻ" + ], + [ + "ËĪa", + "n" + ], + [ + "Ġm", + "ËĪo" + ], + [ + "ËĪa", + "β" + ], + [ + "Ġa", + "l" + ], + [ + "Ġ", + "ËĪeËIJ" + ], + [ + "Ġ", + "θ" + ], + [ + "Ġn", + "ËĪi" + ], + [ + "p", + "ʰ" + ], + [ + "ll", + "a" + ], + [ + "Ġp", + "l" + ], + [ + "ËĪ", + "Åĵ" + ], + [ + "j", + "ËĪÉiju" + ], + [ + "Ġa", + 
"v" + ], + [ + "Ġm", + "ËĪi" + ], + [ + "Ġf", + "ËĪa" + ], + [ + "ËĪÉ", + "ľ" + ], + [ + "m", + "e" + ], + [ + "ËĮÉĻ", + "h" + ], + [ + "ËĪu", + "ÉĻ" + ], + [ + "i", + "t" + ], + [ + "j", + "ËĪe" + ], + [ + "Ġ", + "o" + ], + [ + "ËĪÉľ", + "ËIJ" + ], + [ + "ĠtÉķËĪi", + "ou" + ], + [ + "ÉĶ", + "ËIJ" + ], + [ + "Ġn", + "ÉĻ" + ], + [ + "ËĪÉĻ", + "Éľn" + ], + [ + "Ġm", + "ÉĻ" + ], + [ + "Ġd", + "eËIJ" + ], + [ + "m", + "o" + ], + [ + "s", + "a" + ], + [ + "j", + "ËĪÉĶ" + ], + [ + "ËĪa", + "l" + ], + [ + "ĠtÉķ", + "ËĪiÉĽ" + ], + [ + "ĠÉ¡", + "ÉĻ" + ], + [ + "ð", + "a" + ], + [ + "Ġɪ", + "z" + ], + [ + "Ġs", + "a" + ], + [ + "r", + "i" + ], + [ + "ĠËĮi", + "l" + ], + [ + "ËĮ", + "u" + ], + [ + "Ġk", + "aËIJ" + ], + [ + "ĠÉĻ", + "ËIJ" + ], + [ + "ĠÉ", + "ĸ" + ], + [ + "Ġk", + "a" + ], + [ + "ËĪÊĮh", + "i" + ], + [ + "Ġj", + "eËIJ" + ], + [ + "Ġt", + "ʰ" + ], + [ + "n", + "e" + ], + [ + "k", + "ËIJ" + ], + [ + "Ġts", + "ËĪai" + ], + [ + "Ġ", + "ËĪeËIJk" + ], + [ + "n", + "k" + ], + [ + "t", + "i" + ], + [ + "ËĪa", + "Éľn" + ], + [ + "Ġk", + "ËIJ" + ], + [ + "É¡", + "ÉĻn" + ], + [ + "ËĪi", + "a" + ], + [ + "ĠÉĶ", + "ËIJɾ" + ], + [ + "Ê", + "ı" + ], + [ + "ĠËĮ", + "ÊĮ" + ], + [ + "Ġz", + "ËĪaËIJ" + ], + [ + "Ġl", + "os" + ], + [ + "ÉĽ", + "s" + ], + [ + "ËĪÉĶ", + "n" + ], + [ + "ÉĽ", + "nt" + ], + [ + "ÉĽ", + "n" + ], + [ + "ĠÉŁ", + "ËĪoËIJ" + ], + [ + "ç", + "t" + ], + [ + "Ġd", + "as" + ], + [ + "Ġx", + "ËĮo" + ], + [ + "ËĪu", + "Éľ" + ], + [ + "ËĪa", + "s" + ], + [ + "Ġb", + "ËĪÊĮ" + ], + [ + "ËĪiÉĽ", + "Éľn" + ], + [ + "É", + "IJ" + ], + [ + "Ġts", + "uËIJ" + ], + [ + "Ġp", + "ËĮÉĽ" + ], + [ + "Ġn", + "ËĪÉĶ" + ], + [ + "ÊĬ", + "t" + ], + [ + "m", + "a" + ], + [ + "Ġn", + "ËĪo" + ], + [ + "Ġl", + "ËĪɪ" + ], + [ + "ËĪÉĽ", + "s" + ], + [ + "ɪ", + "l" + ], + [ + "ĠÉķ", + "ËĪiÉĽ" + ], + [ + "Ġ", + "ËĪÊĬ" + ], + [ + "ÉĴ", + "t" + ], + [ + "t", + "o" + ], + [ + "Ġ", + "ËĪo" + ], + [ + "ËĮo", + "n" + ], + [ + "Ġk", + "wËĪa" + ], + [ + "Ġɪ", + "t" + ], + [ + "Ġh", + "oËIJ" + ], + 
[ + "ËĪiËIJ", + "k" + ], + [ + "ĠËĮaËIJp", + "k" + ], + [ + "ËĪaɪ", + "n" + ], + [ + "Ã", + "¦" + ], + [ + "ÉĻn", + "t" + ], + [ + "t", + "a" + ], + [ + "l", + "o" + ], + [ + "Ġn", + "ËĪÉij" + ], + [ + "Ġl", + "ËĪa" + ], + [ + "ËĪi", + "Éľ" + ], + [ + "Ġw", + "ËĪei" + ], + [ + "ÉĽ", + "Êģ" + ], + [ + "Ġt", + "ËĪa" + ], + [ + "Ġɾ", + "ËĮÉĻh" + ], + [ + "ĠÉķËĪi", + "Éij" + ], + [ + "ËĮi", + "ËIJ" + ], + [ + "ËĮÉĽ", + "l" + ], + [ + "ĠtÉĻ", + "Éľ" + ], + [ + "Ġk", + "ËĪuo" + ], + [ + "Ġt", + "ËĪu" + ], + [ + "j", + "ËĪÉĽ" + ], + [ + "ĠËĮi", + "n" + ], + [ + "ɾ", + "e" + ], + [ + "Ġk", + "oËIJ" + ], + [ + "Ġk", + "ËĪa" + ], + [ + "ɾ", + "i" + ], + [ + "ĠtÉķËĪi", + "Éij" + ], + [ + "l", + "ÉĻ" + ], + [ + "Ġk", + "ÉĻ" + ], + [ + "Ġt", + "ËĪi" + ], + [ + "ĠÅĭ", + "ËĪyÉĻ" + ], + [ + "Ġts", + "h" + ], + [ + "e", + "r" + ], + [ + "a", + "v" + ], + [ + "ĠkÉĶ", + "n" + ], + [ + "ËĪÉĻ", + "ÉľÅĭ" + ], + [ + "ð", + "o" + ], + [ + "ËĪaËIJ", + "n" + ], + [ + "Ġbʰ", + "ËĪi" + ], + [ + "ĠkËIJ", + "jaËIJ" + ], + [ + "ÉĻ", + "z" + ], + [ + "Ġp", + "Êģ" + ], + [ + "Ġd", + "ËĪɪ" + ], + [ + "Ġz", + "iËIJ" + ], + [ + "É¡", + "eËIJ" + ], + [ + "Ġt", + "ËĪÉĻ" + ], + [ + "ɪ", + "z" + ], + [ + "Ġn", + "ËĮon" + ], + [ + "t", + "aËIJ" + ], + [ + "b", + "l" + ], + [ + "t", + "e" + ], + [ + "n", + "ËĮeËIJ" + ], + [ + "ËĪɪ", + "l" + ], + [ + "s", + "o" + ], + [ + "k", + "o" + ], + [ + "u", + "Êģ" + ], + [ + "ĠÉ", + "£" + ], + [ + "Ġpa", + "Êģ" + ], + [ + "Ġ", + "ËĪÉĽ" + ], + [ + "j", + "ËĪuËIJ" + ], + [ + "ËĮ", + "ÊĮ" + ], + [ + "y", + "n" + ], + [ + "ËĪiËIJ", + "n" + ], + [ + "Ġl", + "ËĪaɪ" + ], + [ + "ËĪɪ", + "Åĭ" + ], + [ + "ĠtÉķh", + "ËĪy" + ], + [ + "Ġn", + "ËĪÊĮhi" + ], + [ + "Ġd", + "ËĮe" + ], + [ + "Ġj", + "ËĪÉiju" + ], + [ + "Ġt", + "ËĪÉiju" + ], + [ + "Ġh", + "ËĪo" + ], + [ + "ɪ", + "d" + ], + [ + "Ġth", + "ËĪÉij" + ], + [ + "m", + "ËĪe" + ], + [ + "Ġ", + "ËĪÉĻ" + ], + [ + "j", + "a" + ], + [ + "Ġp", + "h" + ], + [ + "ÉĽ", + "t" + ], + [ + "Ġk", + "ËĪÊĮ" + ], + [ + "t", + "ÉĻn" + ], + [ + 
"m", + "ËĪÉij" + ], + [ + "w", + "ËĪe" + ], + [ + "ĠËĮa", + "ɪn" + ], + [ + "Ġð", + "ɪs" + ], + [ + "É¡", + "ÉĻ" + ], + [ + "Ġn", + "ËĪaËIJ" + ], + [ + "Ġb", + "ËĪaËIJ" + ], + [ + "Ġa", + "θ" + ], + [ + "Ġm", + "ËĮa" + ], + [ + "ËĪÊĮh", + "a" + ], + [ + "Ġd", + "ËĮa" + ], + [ + "ËĪ", + "Êı" + ], + [ + "Ġɲ", + "ËĮy" + ], + [ + "Ġp", + "ËĪa" + ], + [ + "ËĪað", + "o" + ], + [ + "d", + "i" + ], + [ + "b", + "Éľ" + ], + [ + "É", + "³" + ], + [ + "Ġw", + "iËIJ" + ], + [ + "Ġn", + "ËĪɪ" + ], + [ + "ĠÉ¡", + "ËĪÉĶÉľ" + ], + [ + "tËIJ", + "o" + ], + [ + "ËĮÉĻ", + "m" + ], + [ + "ËĪaËIJ", + "r" + ], + [ + "Ġm", + "ÉĽ" + ], + [ + "ËĪeËIJ", + "É¡aËIJ" + ], + [ + "Ġs", + "ËĮi" + ], + [ + "Ġl", + "ËĮaËIJ" + ], + [ + "n", + "ËĮaËIJ" + ], + [ + "Ġs", + "p" + ], + [ + "t", + "Êģ" + ], + [ + "ĠÊ", + "İ" + ], + [ + "ËĮ", + "ÉijËIJ" + ], + [ + "Ġk", + "l" + ], + [ + "k", + "ʰ" + ], + [ + "i", + "l" + ], + [ + "ĠÊĥ", + "t" + ], + [ + "ĠËĮÊĬ", + "n" + ], + [ + "a", + "l" + ], + [ + "Ġs", + "ËĪÉĽ" + ], + [ + "Ġm", + "ËĪaËIJ" + ], + [ + "Ġ", + "Åĵ" + ], + [ + "ĠÉ¡", + "ËĪÊĮ" + ], + [ + "ĠpËĮÉĽ", + "r" + ], + [ + "ɾ", + "ËĪa" + ], + [ + "ËIJ", + "ÊĪ" + ], + [ + "ËĪaβ", + "a" + ], + [ + "Ġw", + "ËĪÉĴ" + ], + [ + "Ġx", + "ËĪuei" + ], + [ + "Ġkh", + "ËĪo" + ], + [ + "Ġla", + "s" + ], + [ + "ĠÉĹ", + "ËĪo" + ], + [ + "Ġf", + "ÉĽÉ¾" + ], + [ + "Ġj", + "ËĪiÉĽ" + ], + [ + "Ġt", + "ËĪe" + ], + [ + "Ġk", + "ËĮÉĶ" + ], + [ + "ĠdeËIJ", + "n" + ], + [ + "Ġm", + "o" + ], + [ + "Ġp", + "ËĪi" + ], + [ + "Ġt", + "ËĪÉij" + ], + [ + "ËĪÉĽ", + "st" + ], + [ + "w", + "ËĪÉij" + ], + [ + "ËĪaɪ", + "t" + ], + [ + "ÉĻ", + "ÊĬ" + ], + [ + "Ġ", + "ËĪi" + ], + [ + "ɪ", + "j" + ], + [ + "a", + "ɪ" + ], + [ + "ËĪaËIJ", + "Éľ" + ], + [ + "ĠËĪɪ", + "s" + ], + [ + "Ġp", + "ÉĶɾ" + ], + [ + "æ", + "Éľn" + ], + [ + "k", + "a" + ], + [ + "Åĭ", + "É¡" + ], + [ + "b", + "ÉĻn" + ], + [ + "ÊĬ", + "f" + ], + [ + "Ġp", + "ɹ" + ], + [ + "Ġl", + "ËĮe" + ], + [ + "ËĪiËIJ", + "d" + ], + [ + "ËĪaËIJ", + "re" + ], + [ + "Ġm", + "ËĪÊĮ" + 
], + [ + "ÉĻ", + "r" + ], + [ + "Ġd", + "Éij" + ], + [ + "ËĪaËIJt", + "o" + ], + [ + "Ġp", + "ËĪeËIJ" + ], + [ + "Ġd", + "ËĪoËIJ" + ], + [ + "Ġs", + "ËĮÊĬ" + ], + [ + "Ġh", + "ËĪi" + ], + [ + "Ġs", + "ËĪa" + ], + [ + "ËĪeËIJ", + "n" + ], + [ + "d", + "ÉĻ" + ], + [ + "Ġp", + "j" + ], + [ + "ËĪÅĵ", + "Êģ" + ], + [ + "l", + "ɪç" + ], + [ + "ÉĴ", + "n" + ], + [ + "ĠËĪÉĻ", + "r" + ], + [ + "t", + "ËĪe" + ], + [ + "Ġi", + "l" + ], + [ + "ËĪaËIJ", + "l" + ], + [ + "Ġs", + "ËĮÉĻÊĬ" + ], + [ + "s", + "ÊĪ" + ], + [ + "Ġd", + "ËĪuËIJ" + ], + [ + "h", + "ËĪÉij" + ], + [ + "Ġx", + "ËĪou" + ], + [ + "Ġl", + "ËĪaiÉľ" + ], + [ + "w", + "ËĪo" + ], + [ + "ËĪÉĽnt", + "e" + ], + [ + "Ġs", + "y" + ], + [ + "Ġz", + "ɪç" + ], + [ + "ĠÉ¡", + "ËĪu" + ], + [ + "ĠÉķ", + "ËĪy" + ], + [ + "ËĪÉĶËIJ", + "l" + ], + [ + "ÉĶ", + "l" + ], + [ + "Ġt", + "ËĪo" + ], + [ + "ĠÊĭ", + "oËIJ" + ], + [ + "Ġ", + "iËIJ" + ], + [ + "wËĪa", + "ða" + ], + [ + "ËĪa", + "ndo" + ], + [ + "Ġaθ", + "ÉĽnt" + ], + [ + "Ġaθɼnt", + "wËĪaða" + ], + [ + "Ġt", + "ËĪiÉĽ" + ], + [ + "ËĪei", + "Éľ" + ], + [ + "Ġp", + "ËĮa" + ], + [ + "Ġn", + "ËĪaɪ" + ], + [ + "w", + "a" + ], + [ + "Ġf", + "r" + ], + [ + "ĠÊIJ", + "ËĪÉĻÉľn" + ], + [ + "ËĪu", + "a" + ], + [ + "m", + "i" + ], + [ + "Ġm", + "ËĪÉĽ" + ], + [ + "ËĪeËIJk", + "ʰ" + ], + [ + "c", + "ʰ" + ], + [ + "Ġw", + "ËĪÉij" + ], + [ + "st", + "a" + ], + [ + "Ġt", + "u" + ], + [ + "Ġs", + "k" + ], + [ + "ËĪÉĶ", + "l" + ], + [ + "ËĪeËIJ", + "ÊĪ" + ], + [ + "Ġl", + "ËĪaËIJɪ" + ], + [ + "Ġl", + "ËĪaËIJ" + ], + [ + "ËĪÉĽËIJ", + "s" + ], + [ + "ËĪÉĽÉ¾", + "a" + ], + [ + "ËĪÉĻ", + "Éľt" + ], + [ + "Ġ", + "yn" + ], + [ + "d", + "ÉĻn" + ], + [ + "Ġd", + "i" + ], + [ + "ËĪiËIJ", + "s" + ], + [ + "Ġðe", + "l" + ], + [ + "ËĪÊĮ", + "r" + ], + [ + "Ġh", + "ËĪaËIJ" + ], + [ + "Ġb", + "ÉĻ" + ], + [ + "Ġj", + "ËĪuËIJ" + ], + [ + "ll", + "e" + ], + [ + "st", + "o" + ], + [ + "ËĪɪ", + "t" + ], + [ + "ËĪoËIJ", + "ɾ" + ], + [ + "b", + "ʰ" + ], + [ + "m", + "ÉĻn" + ], + [ + "ËĮu", + "ÉĻ" + ], + [ + 
"ËĮÉĻ", + "ɾ" + ], + [ + "ËĪÊĮ", + "n" + ], + [ + "ĠlËĪaɪ", + "k" + ], + [ + "Ġb", + "ËĪa" + ], + [ + "ɪ", + "ð" + ], + [ + "Ġl", + "o" + ], + [ + "z", + "i" + ], + [ + "ËĪÊĮ", + "st" + ], + [ + "m", + "ËĪi" + ], + [ + "ÉĶ", + "Êģ" + ], + [ + "ĠnËĪɪ", + "çt" + ], + [ + "Ġt", + "ɾ" + ], + [ + "Ġd", + "ËĪeËIJkʰ" + ], + [ + "Ġs", + "ËĮe" + ], + [ + "Ġn", + "ËĪÉĻÊĬ" + ], + [ + "Ġ", + "u" + ], + [ + "Ġs", + "i" + ], + [ + "Ġɪ", + "ç" + ], + [ + "Ġp", + "r" + ], + [ + "ĠtÉķ", + "ËĪy" + ], + [ + "Ġm", + "ËĪu" + ], + [ + "z", + "a" + ], + [ + "Ġt", + "Êģ" + ], + [ + "Ġw", + "ɪð" + ], + [ + "t", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪÊĮɾ" + ], + [ + "Ġk", + "ËĪÉĶ" + ], + [ + "ËĪoËIJ", + "r" + ], + [ + "Ġh", + "ËĮa" + ], + [ + "Ġk", + "ËĪonÉ¡" + ], + [ + "Ġp", + "uÊģ" + ], + [ + "Ġd", + "y" + ], + [ + "ËĪɪ", + "n" + ], + [ + "nt", + "e" + ], + [ + "Ġk", + "ËĮa" + ], + [ + "ËĪÉĻ", + "ɪ" + ], + [ + "Ġm", + "i" + ], + [ + "ĠÉ¡", + "ËĮuÉĻ" + ], + [ + "ĠÊ", + "²" + ], + [ + "Ġf", + "ËĪÉij" + ], + [ + "Ġv", + "ÉijËIJ" + ], + [ + "ĠËĮa", + "ÊĬ" + ], + [ + "ËĮ", + "uËIJ" + ], + [ + "ĠËĪu", + "n" + ], + [ + "Ġj", + "ËĪÊĮha" + ], + [ + "j", + "uËIJ" + ], + [ + "Ġm", + "ɪt" + ], + [ + "Ġl", + "ËĪÉĽ" + ], + [ + "ËĪeËIJ", + "Êĥ" + ], + [ + "Ġf", + "ÉĶËIJ" + ], + [ + "m", + "ÉĻ" + ], + [ + "ɾ", + "t" + ], + [ + "ĠkËĮo", + "n" + ], + [ + "Ġl", + "ËĪÉĶ" + ], + [ + "Ġx", + "ËĪÉiju" + ], + [ + "p", + "l" + ], + [ + "Ġd", + "ËĪi" + ], + [ + "Ġl", + "ËĪoËIJ" + ], + [ + "s", + "ÉĻ" + ], + [ + "ËĪaËIJ", + "va" + ], + [ + "Ġl", + "ËĪu" + ], + [ + "ĠÉ¡", + "ËĮÉĻÊĬ" + ], + [ + "Ġh", + "av" + ], + [ + "ĠËĮaËIJpk", + "ËĮoËIJ" + ], + [ + "ɾ", + "ËĪi" + ], + [ + "Ġf", + "ËĪÉĻ" + ], + [ + "Ġh", + "ËĮÉĻm" + ], + [ + "ËĪonÉ¡", + "Éľ" + ], + [ + "j", + "o" + ], + [ + "Ġs", + "ÉĶ" + ], + [ + "ËĪaËIJ", + "d" + ], + [ + "w", + "ËĪiÉĻ" + ], + [ + "ËĪa", + "nd" + ], + [ + "ËĮa", + "ɪn" + ], + [ + "t", + "ɾ" + ], + [ + "ĠËĮ", + "ɪ" + ], + [ + "ĠËĪu", + "na" + ], + [ + "Ġx", + "wËĪÉij" + ], + [ + "Ġj", + "ÉĶËIJ" + ], + [ 
+ "Êģ", + "ËĪi" + ], + [ + "ĠkËĪuo", + "Éľ" + ], + [ + "Ġa", + "β" + ], + [ + "ĠÉ¡", + "ËĪaËIJ" + ], + [ + "an", + "o" + ], + [ + "t", + "ÉĻl" + ], + [ + "Ġr", + "ËĮe" + ], + [ + "ËĮÊĮ", + "t" + ], + [ + "ĠjËĪi", + "Éij" + ], + [ + "ĠɾËĮÉĻh", + "aËIJ" + ], + [ + "Ġm", + "ËĪe" + ], + [ + "ĠËĪy", + "Ã¦Éľn" + ], + [ + "Ġf", + "ËĪu" + ], + [ + "Ġb", + "l" + ], + [ + "n", + "ËĪi" + ], + [ + "s", + "ÉĻn" + ], + [ + "Ġa", + "ɪn" + ], + [ + "ËĪi", + "ÊĬ" + ], + [ + "Ġðe", + "ɪ" + ], + [ + "Ġɪ", + "ts" + ], + [ + "Ġ", + "(" + ], + [ + "ËĪy", + "ËIJ" + ], + [ + "ÉĻ", + "d" + ], + [ + "ĠËĮ", + "o" + ], + [ + "ĠÉĽ", + "s" + ], + [ + "Ġv", + "iËIJ" + ], + [ + "ËIJ", + "É¡eËIJ" + ], + [ + "k", + "ËĪe" + ], + [ + "ĠËĪa", + "l" + ], + [ + "ÉĽ", + "l" + ], + [ + "Ġ", + "ÊĮ" + ], + [ + "ËIJ", + "o" + ], + [ + "Ġk", + "ËĪo" + ], + [ + "ĠÊĪ", + "ËĪuËIJ" + ], + [ + "Ġs", + "ËĪɪ" + ], + [ + "ËĪeËIJ", + "ɾ" + ], + [ + "Éľ", + "m" + ], + [ + "ËĮ", + "ÉĻn" + ], + [ + "ËĪaËIJ", + "i" + ], + [ + "ËĪoËIJ", + "l" + ], + [ + "ɪ", + "ËĮeËIJ" + ], + [ + "Ġʲ", + "ËĪy" + ], + [ + "Ġk", + "ËĪÉĶËIJ" + ], + [ + "s", + "ËĪi" + ], + [ + "Ġl", + "ËĪe" + ], + [ + "ËĮ", + "ÉĴt" + ], + [ + "ËĪiËIJ", + "p" + ], + [ + "a", + "Êģ" + ], + [ + "Ġθ", + "ËĪɪÅĭ" + ], + [ + "ËĪÉĻËIJ", + "ɪ" + ], + [ + "ËĪÊĮ", + "l" + ], + [ + "ĠhËĪoËIJ", + "taËIJ" + ], + [ + "ËĪo", + "ɪ" + ], + [ + "nt", + "o" + ], + [ + "z", + "h" + ], + [ + "ĠdeËIJ", + "m" + ], + [ + "ĠkÉĶ", + "m" + ], + [ + "ʰ", + "ËĪiËIJk" + ], + [ + "ĠdÊĴ", + "ËĪÊĮst" + ], + [ + "p", + "ɾ" + ], + [ + "Ġl", + "y" + ], + [ + "h", + "ËĪu" + ], + [ + "ËĪÉĶ", + "ø" + ], + [ + "ËĪaËIJ", + "s" + ], + [ + "ĠËĪa", + "n" + ], + [ + "Ġ", + "ËĪÉĴ" + ], + [ + "Ġk", + "an" + ], + [ + "Ġts", + "ËĪuo" + ], + [ + "ËĪeËIJ", + "va" + ], + [ + "ĠÉ¡", + "ɾ" + ], + [ + "Ġp", + "o" + ], + [ + "ĠtÊĥ", + "ËĪÉĶ" + ], + [ + "Êİ", + "a" + ], + [ + "Ġm", + "ËĮi" + ], + [ + "Êĥ", + "t" + ], + [ + "t", + "ËĪi" + ], + [ + "Ġh", + "ËĪÊĮ" + ], + [ + "tÊĥ", + "e" + ], + [ + "Ġf", + "ÉĶn" + ], + 
[ + "v", + "e" + ], + [ + "Ġn", + "ËĮe" + ], + [ + "ËĪÉĶ", + "Êģ" + ], + [ + "i", + "z" + ], + [ + "Ġs", + "ËĪuo" + ], + [ + "ËĪÉĽËIJ", + "r" + ], + [ + "wËĪa", + "Êģ" + ], + [ + "ËĪað", + "a" + ], + [ + "Åĭ", + "k" + ], + [ + "p", + "o" + ], + [ + "Ġk", + "ËĪi" + ], + [ + "ËĪa", + "d" + ], + [ + "Ġv", + "ËĪi" + ], + [ + "t", + "Éķ" + ], + [ + "Ġk", + "ËĪÉĻ" + ], + [ + "Ġw", + "ËĪu" + ], + [ + "ÉĴ", + "z" + ], + [ + "ĠvÉijËIJ", + "ɾ" + ], + [ + "Êģ", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪaËIJ" + ], + [ + "k", + "e" + ], + [ + "n", + "ÉĻ" + ], + [ + "ËĪÊĮ", + "b" + ], + [ + "ËĪuËIJ", + "ɾ" + ], + [ + "ËĮÉĻ", + "ËIJ" + ], + [ + "ĠÊĪ", + "ʰËĪiËIJk" + ], + [ + "Ġk", + "ËĪu" + ], + [ + "Ġb", + "ËĮÊĮt" + ], + [ + "Ġa", + "t" + ], + [ + "Ġf", + "ɹ" + ], + [ + "ËĪa", + "x" + ], + [ + "Ġz", + "oËIJ" + ], + [ + "Ġt", + "ËĪaËIJ" + ], + [ + "Ġð", + "ËĮe" + ], + [ + "n", + "eËIJ" + ], + [ + "ĠÉij", + "ËIJ" + ], + [ + "Ġa", + "ÊĬf" + ], + [ + "a", + "m" + ], + [ + "ÊĬ", + "Åĭ" + ], + [ + "ĠÉĶ", + "ËIJ" + ], + [ + "ĠÉķËĪi", + "ÉľÅĭ" + ], + [ + "Ġ", + "ËĪÉĶËIJl" + ], + [ + "ɪ", + "m" + ], + [ + "j", + "ËĪo" + ], + [ + "ËĪiËIJ", + "ÉŁ" + ], + [ + "Ġkw", + "ËĮÉĽ" + ], + [ + "ĠmËĪa", + "s" + ], + [ + "ÉĻ", + "h" + ], + [ + "ĠËĪa", + "ÊĬ" + ], + [ + "ËĪÉĶ", + "ɪ" + ], + [ + "É¡", + "ÉĻɾ" + ], + [ + "r", + "ÉĻn" + ], + [ + "ËĪɪ", + "k" + ], + [ + "s", + "se" + ], + [ + "Ġp", + "ËĪÉij" + ], + [ + "ĠÉĹ", + "ËĮe" + ], + [ + "ĠÉĹ", + "ËĪi" + ], + [ + "Ġa", + "z" + ], + [ + "ĠÉ¡ËĪÊĮ", + "jaËIJ" + ], + [ + "z", + "e" + ], + [ + "ĠÉĹ", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĪi" + ], + [ + "ĠËĮ", + "ÉĴn" + ], + [ + "Ġx", + "ËĪo" + ], + [ + "ĠËĮÊĬ", + "na" + ], + [ + "Ġtʰ", + "aËIJ" + ], + [ + "Ġs", + "Éij" + ], + [ + "ËĪeɪ", + "ÊĥÉĻn" + ], + [ + "ĠtÉķËĪi", + "Éľ" + ], + [ + "ĠÉŁ", + "aËIJ" + ], + [ + "p", + "ËIJ" + ], + [ + "Ġpl", + "y" + ], + [ + "θ", + "ËĪi" + ], + [ + "ËIJ", + "Éĸ" + ], + [ + "Ġt", + "ËĪuei" + ], + [ + "Ġl", + "ËĪÉĻ" + ], + [ + "Ġd", + "ÉijËIJ" + ], + [ + "f", + "t" + ], + [ + "ËĪa", + 
"m" + ], + [ + "ĠsËĪÊĮ", + "kt" + ], + [ + "Ġt", + "ËĪou" + ], + [ + "Ġp", + "ËĪiÉĽ" + ], + [ + "ĠËĪa", + "i" + ], + [ + "ĠwËĪÉĴ", + "n" + ], + [ + "Ġz", + "ËĮaɪn" + ], + [ + "Ġe", + "st" + ], + [ + "Ġm", + "ÉĶ" + ], + [ + "ĠtÉķ", + "jËĪÉiju" + ], + [ + "Éľ", + "p" + ], + [ + "ËĪÊĮ", + "z" + ], + [ + "b", + "i" + ], + [ + "ËĪÉĽËIJs", + "eËIJ" + ], + [ + "Ġl", + "ËĪy" + ], + [ + "Ġm", + "ËĮe" + ], + [ + "Ġd", + "ËĮÉĽl" + ], + [ + "ËĪiËIJ", + "l" + ], + [ + "ĠkËĮo", + "mo" + ], + [ + "Ġh", + "ËĪaÉľn" + ], + [ + "ËĪoËIJ", + "ne" + ], + [ + "ĠkËĪÊĮɾ", + "t" + ], + [ + "Ġsy", + "Êģ" + ], + [ + "ËĮÉĶ", + "ɾ" + ], + [ + "Ġɪ", + "f" + ], + [ + "u", + "v" + ], + [ + "z", + "ÉĻn" + ], + [ + "o", + "l" + ], + [ + "Ï", + "ĩ" + ], + [ + "i", + "m" + ], + [ + "Ġm", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ɪ" + ], + [ + "Ġv", + "ËĪÉĽ" + ], + [ + "ÊĬ", + "d" + ], + [ + "Ġt", + "r" + ], + [ + "ËĪeËIJ", + "s" + ], + [ + "ð", + "e" + ], + [ + "d", + "e" + ], + [ + "ʰ", + "Ïĩ" + ], + [ + "ÉŁ", + "ʰ" + ], + [ + "ËĮÉĻËIJ", + "ÉªÉľ" + ], + [ + "b", + "ËIJ" + ], + [ + "ËĪÊĬ", + "k" + ], + [ + "ĠnËĪÉĶ", + "ÉªÉľ" + ], + [ + "ĠËĮ", + "iËIJ" + ], + [ + "ËĪÉijËIJ", + "t" + ], + [ + "ËĪiËIJ", + "ɾ" + ], + [ + "Ġt", + "ɹ" + ], + [ + "ɾ", + "ÉĶ" + ], + [ + "Ġw", + "ÉĴz" + ], + [ + "Ġv", + "u" + ], + [ + "b", + "ÉĻl" + ], + [ + "b", + "ÉĻ" + ], + [ + "ɹ", + "i" + ], + [ + "nt", + "s" + ], + [ + "Ġs", + "ËĪaËIJ" + ], + [ + "d", + "ʰ" + ], + [ + "Ġt", + "ÊĬ" + ], + [ + "ĠÊİ", + "ËĮi" + ], + [ + "β", + "a" + ], + [ + "h", + "ËĪÉĻÉľÅĭ" + ], + [ + "Ġs", + "ËĪiËIJ" + ], + [ + "ĠpËĮa", + "ɾa" + ], + [ + "ËĪÉĽÉ¾", + "ÉĶ" + ], + [ + "ËĪɪ", + "s" + ], + [ + "É£", + "o" + ], + [ + "ĠËĮa", + "l" + ], + [ + "o", + "r" + ], + [ + "Ġb", + "ËĪÊĮh" + ], + [ + "Ġk", + "ËĪoËIJ" + ], + [ + "Ġt", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪo" + ], + [ + "ĠÊĴ", + "ÉĻ" + ], + [ + "p", + "Êģ" + ], + [ + "Ġ", + "ËĪaɪ" + ], + [ + "hËĪÉij", + "ÉľÅĭ" + ], + [ + "ÉĻl", + "i" + ], + [ + "ËĪeɪ", + "t" + ], + [ + "ĠjËĪiou", + "Éľ" + ], + [ + "Ġd", + 
"ËĪÉĻ" + ], + [ + "Ġm", + "ËĪÉĶËIJ" + ], + [ + "l", + "ËĪi" + ], + [ + "ËĮy", + "ÉĻ" + ], + [ + "ĠlËĪoËIJ", + "É¡" + ], + [ + "Ġn", + "ËĪÊĮ" + ], + [ + "Ġh", + "ËĪÊĬ" + ], + [ + "Ġn", + "ËĪÉĻÉľÅĭ" + ], + [ + "ĠÊģ", + "ÉĻ" + ], + [ + "z", + "ËĪi" + ], + [ + "Ġt", + "ËĪuËIJ" + ], + [ + "ĠkËĮo", + "me" + ], + [ + "Ġl", + "ËĪeËIJ" + ], + [ + "ËĪaËIJt", + "aËIJ" + ], + [ + "Ġa", + "n" + ], + [ + "ĠËĪy", + "u" + ], + [ + "ĠËĮÊĮ", + "É¡ÉĻɾ" + ], + [ + "ĠËĪɪ", + "n" + ], + [ + "ĠhËĪo", + "ÉĻ" + ], + [ + "v", + "ÉĻ" + ], + [ + "ËĪø", + "ËIJ" + ], + [ + "θj", + "a" + ], + [ + "ËĪuÉĻ", + "Éľn" + ], + [ + "Ġk", + "ÉĻɾ" + ], + [ + "ËĪa", + "t" + ], + [ + "j", + "ËĪø" + ], + [ + "ËĪÉĽt", + "Êģ" + ], + [ + "Ġp", + "ËĪÉiju" + ], + [ + "st", + "ÉĻ" + ], + [ + "Ġw", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "l" + ], + [ + "ÊĪ", + "i" + ], + [ + "Ġx", + "ËĪaiÉľ" + ], + [ + "ËĪy", + "Êģ" + ], + [ + "ĠhËĪoËIJ", + "É¡aËIJ" + ], + [ + "Ġts", + "ËĪi" + ], + [ + "ĠËĪÊĮ", + "p" + ], + [ + "Ġn", + "ËĮÉĴt" + ], + [ + "ĠlËĪɪ", + "eËIJ" + ], + [ + "Ġh", + "ËĪa" + ], + [ + "Ġf", + "l" + ], + [ + "Ġn", + "ËĪeËIJ" + ], + [ + "ËĮaËIJ", + "ɪ" + ], + [ + "Ġt", + "ËĪuo" + ], + [ + "tÊĥ", + "ËIJ" + ], + [ + "s", + "ËĪe" + ], + [ + "bʰ", + "i" + ], + [ + "ĠbËĪÊĮh", + "ÊĬt" + ], + [ + "ËĪÉĽ", + "nd" + ], + [ + "Ġs", + "ËĪÉĶ" + ], + [ + "ÉĻn", + "s" + ], + [ + "ËĮÉĻ", + "l" + ], + [ + "ÉĽ", + "Éľ" + ], + [ + "ĠÉ¡", + "l" + ], + [ + "ËĪɪ", + "ɾ" + ], + [ + "ËĪaËIJt", + "a" + ], + [ + "Éľ", + "ËIJ" + ], + [ + "ËĪÉĽnt", + "o" + ], + [ + "sk", + "ËĮoËIJ" + ], + [ + "ËĪÉĽ", + "k" + ], + [ + "ts", + "i" + ], + [ + "Ġt", + "ËĪonÉ¡" + ], + [ + "Ġb", + "iËIJ" + ], + [ + "Ġh", + "ËĪaËIJɪ" + ], + [ + "Ġb", + "ËĪi" + ], + [ + "j", + "j" + ], + [ + "Êİ", + "i" + ], + [ + "Ġk", + "ʰ" + ], + [ + "Ġs", + "ËĪo" + ], + [ + "ll", + "o" + ], + [ + "Ġb", + "aɪ" + ], + [ + "ĠÉĽ", + "nt" + ], + [ + "Ġ", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪo" + ], + [ + "ɾ", + "eËIJ" + ], + [ + "Ġk", + "Êĭ" + ], + [ + "Ġm", + "ËĪeiÉľ" + ], + [ + "ÊĬ", + 
"ËĪÉĶËIJ" + ], + [ + "Ġt", + "ËĪaɪ" + ], + [ + "Ġsu", + "s" + ], + [ + "Ġr", + "i" + ], + [ + "Ġv", + "ËĮÉĽ" + ], + [ + "ËĪiËIJ", + "no" + ], + [ + "v", + "ano" + ], + [ + "ĠdËĮi", + "ËIJ" + ], + [ + "ĠÊIJ", + "ËĪaÉľn" + ], + [ + "Ê", + "Ĥ" + ], + [ + "ĠÉIJ", + "b" + ], + [ + "ËĪaËIJ", + "h" + ], + [ + "ɪ", + "Êĥ" + ], + [ + "ĠdËĮe", + "lla" + ], + [ + "tËIJ", + "i" + ], + [ + "ĠËĪÊĬ", + "n" + ], + [ + "Ġh", + "iËIJ" + ], + [ + "Ġb", + "ËĪaËIJt" + ], + [ + "Ġth", + "ËĪi" + ], + [ + "Ġa", + "m" + ], + [ + "Ġ", + "ËĪoËIJ" + ], + [ + "Ġh", + "u" + ], + [ + "Ġk", + "ËĪÊĮh" + ], + [ + "Ġz", + "ËĪÉijËIJ" + ], + [ + "ĠÉ¡", + "ËĮÉĶ" + ], + [ + "Ġ", + "ËĪÉĻÊĬ" + ], + [ + "y", + "ËĪi" + ], + [ + "Ġl", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪeËIJ" + ], + [ + "Ġs", + "ËĪÉĶËIJ" + ], + [ + "sk", + "ËĮeËIJ" + ], + [ + "ɾ", + "o" + ], + [ + "Êģ", + "ËĪÉij" + ], + [ + "t", + "ËĪa" + ], + [ + "Ġk", + "ËĪÊĬ" + ], + [ + "ËĪant", + "e" + ], + [ + "Ġd", + "ÉĶ" + ], + [ + "Ġs", + "ËĪeɪ" + ], + [ + "Ġs", + "ÉĽt" + ], + [ + "ɹ", + "ɪ" + ], + [ + "ĠÉ¡ËĮÉĻÊĬ", + "ɪÅĭ" + ], + [ + "z", + "o" + ], + [ + "Ġj", + "ËĪaËIJ" + ], + [ + "ĠÉĴv", + "ðÉĻ" + ], + [ + "ĠÊ", + "Ŀ" + ], + [ + "ĠÉĽ", + "l" + ], + [ + "Ġs", + "ËĪoËIJ" + ], + [ + "Ġth", + "ËĪiÉľ" + ], + [ + "Ġ", + "ËĪÉĽl" + ], + [ + "Ġly", + "ËĮi" + ], + [ + "nd", + "ÊĴ" + ], + [ + "ĠÉķ", + "jËĪÉiju" + ], + [ + "θ", + "a" + ], + [ + "ĠɾËĮÉĻh", + "eËIJ" + ], + [ + "Ġma", + "ɪ" + ], + [ + "j", + "ÉĻ" + ], + [ + "ĠËĪÊĮ", + "b" + ], + [ + "as", + "jËĪÉĶ" + ], + [ + "d", + "Êģ" + ], + [ + "Ġkh", + "ËĪa" + ], + [ + "ĠËĪe", + "s" + ], + [ + "v", + "i" + ], + [ + "f", + "i" + ], + [ + "ËĮÉĻ", + "b" + ], + [ + "Ġr", + "e" + ], + [ + "Ġav", + "ËĮÉĽ" + ], + [ + "Ġt", + "ËĮi" + ], + [ + "Ġk", + "ɾ" + ], + [ + "Ġb", + "ɪk" + ], + [ + "st", + "e" + ], + [ + "ËĪeËIJÊĥ", + "c" + ], + [ + "p", + "t" + ], + [ + "z", + "ÉĻ" + ], + [ + "Ġw", + "ËĪaËIJ" + ], + [ + "k", + "l" + ], + [ + "ĠsËĪÊĮ", + "m" + ], + [ + "ɪ", + "ÊĪ" + ], + [ + "d", + "z" + ], + [ + "v", + "o" + ], 
+ [ + "ËĮa", + "ÊĬt" + ], + [ + "nd", + "e" + ], + [ + "Ġd", + "ÉĽs" + ], + [ + "ĠÉŁ", + "ËĪaËIJ" + ], + [ + "Ġr", + "ËĮi" + ], + [ + "s", + "ËĮeËIJ" + ], + [ + "É¡", + "i" + ], + [ + "Ġal", + "s" + ], + [ + "ËĪi", + "ðo" + ], + [ + "ĠnËĪi", + "Éľn" + ], + [ + "ÊĬ", + "l" + ], + [ + "ts", + "ËIJ" + ], + [ + "ËĪant", + "o" + ], + [ + "ĠÉĹ", + "ËĪÉĻÊĬ" + ], + [ + "kËIJ", + "i" + ], + [ + "ĠsËĪÊĮ", + "b" + ], + [ + "Ġn", + "ËĪa" + ], + [ + "Ġl", + "ËĮo" + ], + [ + "Ġph", + "ËĪi" + ], + [ + "m", + "ËĮe" + ], + [ + "Ġf", + "a" + ], + [ + "k", + "ÉĻ" + ], + [ + "Ġz", + "ËĪu" + ], + [ + "n", + "s" + ], + [ + "ĠÊģ", + "e" + ], + [ + "Ġb", + "ËĪo" + ], + [ + "ËĪaËIJt", + "i" + ], + [ + "Ġm", + "an" + ], + [ + "ĠlËĪi", + "Éij" + ], + [ + "ĠÉĹ", + "ËĮyÉĻ" + ], + [ + "Ġf", + "ËĪÉĶËIJ" + ], + [ + "ĠkÊĭ", + "ËĪeËIJÊĥc" + ], + [ + "Ġx", + "ËĪÉij" + ], + [ + "ĠtÉķ", + "ËĪu" + ], + [ + "j", + "ÉĻɾ" + ], + [ + "Ġɪ", + "st" + ], + [ + "w", + "ËĪi" + ], + [ + "ĠËĮaɪn", + "ÉĻ" + ], + [ + "ɪ", + "É¡" + ], + [ + "Ġs", + "ÊĪ" + ], + [ + "ËĪi", + "ÉĻl" + ], + [ + "Ġn", + "ËĪiÉĽÉľn" + ], + [ + "ĠËĮÉĽ", + "ËIJ" + ], + [ + "ËĪaɪ", + "nd" + ], + [ + "Ġz", + "ËĪi" + ], + [ + "v", + "ÉĻn" + ], + [ + "m", + "z" + ], + [ + "ð", + "os" + ], + [ + "dÊĴ", + "ËIJ" + ], + [ + "j", + "ËĪa" + ], + [ + "ɾ", + "ËĪÉĶ" + ], + [ + "l", + "ËĪe" + ], + [ + "Ê", + "²" + ], + [ + "Ġv", + "ËĪÉĶ" + ], + [ + "Ġl", + "ËĪiÉĽ" + ], + [ + "θ", + "e" + ], + [ + "mËĪe", + "nte" + ], + [ + "Ġɪn", + "ðÉĻ" + ], + [ + "Ġaɪ", + "m" + ], + [ + "n", + "ÉĻn" + ], + [ + "Ġh", + "ÉĻm" + ], + [ + "ɾ", + "aËIJ" + ], + [ + "ĠsËĪuo", + "Éľ" + ], + [ + "Ġɲ", + "ËĪi" + ], + [ + "Ġɹ", + "ËĪiÉĻl" + ], + [ + "l", + "ËĪa" + ], + [ + "Ġb", + "ËĪÉĶ" + ], + [ + "Ġk", + "ËĪai" + ], + [ + "Êģ", + "ËĪa" + ], + [ + "Ġw", + "ËĪÉľËIJ" + ], + [ + "Ġa", + "ËIJ" + ], + [ + "Ġp", + "as" + ], + [ + "ËĪÊĮ", + "s" + ], + [ + "w", + "ËĪÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪe" + ], + [ + "ĠhËĮa", + "tÉĻ" + ], + [ + "a", + "ɪn" + ], + [ + "ĠËĪÉĶ", + "pʰ" + ], + [ + 
"Êģ", + "ËĪe" + ], + [ + "ĠÉŁaËIJ", + "ËĪeËIJÉ¡aËIJ" + ], + [ + "ĠËĪÊĬ", + "s" + ], + [ + "ĠtÉķhËĪi", + "Éľ" + ], + [ + "nt", + "Êĥ" + ], + [ + "Ġx", + "ËĪuo" + ], + [ + "ËĪu", + "Êģ" + ], + [ + "Ġɪ", + "m" + ], + [ + "ɳ", + "Éĸ" + ], + [ + "ËĪyÉĻ", + "Éľkh" + ], + [ + "ĠËĪy", + "ÉĽ" + ], + [ + "Ġm", + "ËĮaËIJ" + ], + [ + "Åĵ", + "Êģ" + ], + [ + "ĠËĪa", + "lt" + ], + [ + "Ġk", + "ÉĻm" + ], + [ + "Êİ", + "o" + ], + [ + "ĠÉIJ", + "n" + ], + [ + "Ġf", + "y" + ], + [ + "ĠËĮÉĽ", + "ra" + ], + [ + "ĠÉ¡", + "ËĪÊĬ" + ], + [ + "Ġp", + "ËĪÊĮ" + ], + [ + "l", + "s" + ], + [ + "Ġl", + "ËĪiËIJ" + ], + [ + "ĠÊĤ", + "ËĪy" + ], + [ + "Ġbɪk", + "ËĪÊĮz" + ], + [ + "ĠÉ¡", + "ÉĽt" + ], + [ + "Ġb", + "ɾ" + ], + [ + "t", + "ʰ" + ], + [ + "tÉĻl", + "ËĮÉĻb" + ], + [ + "x", + "o" + ], + [ + "sk", + "ËĮaËIJ" + ], + [ + "ɲ", + "ʲ" + ], + [ + "ËĪeËIJk", + "ÊĪ" + ], + [ + "r", + "ÉĻ" + ], + [ + "tÊĥ", + "o" + ], + [ + "ĠpÊģ", + "ÉĶ" + ], + [ + "Ġɹ", + "ËĪaɪt" + ], + [ + "Ġp", + "ËĪei" + ], + [ + "ËĮ", + "ɪç" + ], + [ + "j", + "ËĪÉĽÉ¾" + ], + [ + "tËIJ", + "a" + ], + [ + "ĠÉIJb", + "ËĮaÊĬt" + ], + [ + "ĠkÊĭËĪeËIJÊĥc", + "ÉĻn" + ], + [ + "Ġv", + "ËĪe" + ], + [ + "ÊĬ", + "Éľ" + ], + [ + "Ġa", + "kËĪe" + ], + [ + "Ġp", + "ËĪai" + ], + [ + "v", + "ËĪÉĽ" + ], + [ + "Ġθ", + "ɹ" + ], + [ + "ɪ", + "f" + ], + [ + "Ġav", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪe" + ], + [ + "d", + "ËĪi" + ], + [ + "ËĪeËIJ", + "Éĸ" + ], + [ + "Ġb", + "ÉĻt" + ], + [ + "ÊĪ", + "ʰ" + ], + [ + "t", + "eËIJ" + ], + [ + "θj", + "ËĪÉĶn" + ], + [ + "d", + "Éľ" + ], + [ + "ĠjËĪi", + "Éľ" + ], + [ + "Ġv", + "e" + ], + [ + "É£", + "ËĪu" + ], + [ + "ËĪÊĮh", + "ÉĻl" + ], + [ + "Ġp", + "ÉĶ" + ], + [ + "ĠÉ¡", + "r" + ], + [ + "Ġð", + "a" + ], + [ + "Ġv", + "ËĪiËIJ" + ], + [ + "ĠËĮ", + "ÉijËIJ" + ], + [ + "ËĪÉĻÊĬ", + "nt" + ], + [ + "Ġb", + "ËĪaËIJɾ" + ], + [ + "ĠmËĪÊĮ", + "tÉĻlËĮÉĻb" + ], + [ + "l", + "d" + ], + [ + "ĠtÉķ", + "ËĮÉĶ" + ], + [ + "p", + "a" + ], + [ + "ð", + "ËĪad" + ], + [ + "ËĪi", + "ɾ" + ], + [ + "Ġx", + "ËĪu" + ], + [ + 
"ĠlËĪi", + "ÉľÅĭ" + ], + [ + "ËĪeɪ", + "s" + ], + [ + "ĠÉĹËĮe", + "Éľn" + ], + [ + "Ġth", + "ËĪiÉĽ" + ], + [ + "tËIJ", + "e" + ], + [ + "ĠavËĮÉĽ", + "k" + ], + [ + "ĠËĮ", + "ÉĶ" + ], + [ + "Ġk", + "ËĪÉiju" + ], + [ + "ɪ", + "v" + ], + [ + "iËIJ", + "z" + ], + [ + "ËĪo", + "s" + ], + [ + "ĠÉ¡", + "ɹ" + ], + [ + "a", + "nd" + ], + [ + "ĠlËĪi", + "ou" + ], + [ + "ĠËĪo", + "Éľ" + ], + [ + "É¡", + "l" + ], + [ + "Ġp", + "ËĪÉĶËIJ" + ], + [ + "Ġm", + "ËĮeËIJ" + ], + [ + "Ġk", + "ËĪÉĴ" + ], + [ + "n", + "os" + ], + [ + "ç", + "ÉĻn" + ], + [ + "f", + "ÉĻn" + ], + [ + "ĠsËĪÊĮkt", + "ËĮeËIJ" + ], + [ + "Ġ", + "ËĪaɪn" + ], + [ + "ËĪoËIJ", + "re" + ], + [ + "j", + "ËĪÉĽn" + ], + [ + "Ġð", + "ËĪÉĽn" + ], + [ + "ĠtÉķh", + "ËĪiÉĽÉľn" + ], + [ + "Ġh", + "ËĪaɪ" + ], + [ + "ɾ", + "ËĪÉĽ" + ], + [ + "Ġs", + "ËĪu" + ], + [ + "ĠkËĪɪ", + "jaËIJ" + ], + [ + "Ġpj", + "ËĮÊĬ" + ], + [ + "ĠhÉĻm", + "ËĮaËIJ" + ], + [ + "ĠËĮÊĮ", + "p" + ], + [ + "Ġp", + "ËĪÊĮhÉĻl" + ], + [ + "Ġx", + "ËĪÉĻ" + ], + [ + "d", + "ËĪe" + ], + [ + "Ġm", + "Éij" + ], + [ + "ĠÊĬ", + "m" + ], + [ + "nd", + "ÉĻ" + ], + [ + "Ġd", + "ËĪÉĻÊĬnt" + ], + [ + "ËĪeËIJ", + "ÊĥÉĻn" + ], + [ + "Ġða", + "ts" + ], + [ + "i", + "s" + ], + [ + "Ġc", + "ËĪaËIJh" + ], + [ + "p", + "e" + ], + [ + "Ġs", + "ËĮo" + ], + [ + "Ġð", + "ËĪe" + ], + [ + "Ġs", + "ËĪaËIJt" + ], + [ + "ËĪa", + "Êģ" + ], + [ + "Ġs", + "ËĪe" + ], + [ + "ÉĻ", + "k" + ], + [ + "ɪ", + "Êĭ" + ], + [ + "ĠkËĪoËIJ", + "i" + ], + [ + "k", + "ÉĶ" + ], + [ + "Ġv", + "ËĪaËIJÊĬ" + ], + [ + "Ġf", + "ËĪei" + ], + [ + "Ġl", + "ËĪeËIJk" + ], + [ + "Ġh", + "ËĪiÉĻ" + ], + [ + "Ġa", + "ÊĬ" + ], + [ + "ËĪÉĽ", + "ndo" + ], + [ + "ËĪe", + "s" + ], + [ + "Ġz", + "ËĪÉĶ" + ], + [ + "Ġ", + "ËĪÉĽÉ¾a" + ], + [ + "nËĪi", + "Éľn" + ], + [ + "ĠkËĪÊĮ", + "m" + ], + [ + "Ġl", + "ËĪÉĴ" + ], + [ + "ɪ", + "st" + ], + [ + "Ġp", + "Éij" + ], + [ + "Ġf", + "ËĪÉĶ" + ], + [ + "Ġth", + "ËĪonÉ¡" + ], + [ + "nk", + "e" + ], + [ + "ËĮ", + "ɪk" + ], + [ + "Ġɲ", + "ËĪÉĻ" + ], + [ + "ËĮÊĮ", + "m" + ], + [ + "ËĪiËIJ", 
+ "t" + ], + [ + "ĠwËĪÉĴ", + "nt" + ], + [ + "ËĪaβ", + "an" + ], + [ + "ĠbËĪÊĮ", + "r" + ], + [ + "ÉĽ", + "nd" + ], + [ + "ĠËĮÉijËIJ", + "bÉľ" + ], + [ + "Ġv", + "ËĪaɪ" + ], + [ + "ĠtÊĥ", + "ËĮi" + ], + [ + "ĠθËĪɪÅĭ", + "k" + ], + [ + "st", + "i" + ], + [ + "Ġk", + "ɹ" + ], + [ + "ĠËĪa", + "ÊĬt" + ], + [ + "st", + "ÉĻn" + ], + [ + "ĠÊĭ", + "ËĪÊĮn" + ], + [ + "ĠÉ¡", + "ËĮaËIJ" + ], + [ + "ËĪaËIJÉľ", + "ɲ" + ], + [ + "Êģ", + "i" + ], + [ + "ĠnËĪÉĶ", + "x" + ], + [ + "ĠɹËĪiÉĻl", + "ɪ" + ], + [ + "Ġv", + "ËĮi" + ], + [ + "Ġðe", + "ÉĻ" + ], + [ + "ËĮɪ", + "tÊĥ" + ], + [ + "Ġv", + "ËĪyÉĻ" + ], + [ + "ĠËĮaËIJpk", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĮaËIJɪ" + ], + [ + "Ġp", + "ËĪÉĶ" + ], + [ + "ĠnËĪÊĮ", + "mb" + ], + [ + "θ", + "es" + ], + [ + "j", + "ËĪÉĽÊģ" + ], + [ + "ĠkËĪÊĬ", + "cʰ" + ], + [ + "m", + "ËĪÉĽ" + ], + [ + "Ġv", + "ËĪu" + ], + [ + "Ġl", + "ÅĵÊģ" + ], + [ + "ĠiËIJ", + "m" + ], + [ + "ÊĪ", + "ÉĻɾ" + ], + [ + "tÊĥ", + "i" + ], + [ + "ËIJ", + "s" + ], + [ + "Ġt", + "ËĪy" + ], + [ + "ĠmËĪi", + "ÉľÅĭ" + ], + [ + "ɾ", + "ËĪe" + ], + [ + "m", + "ËĮa" + ], + [ + "Ġm", + "ËĮiËIJ" + ], + [ + "ĠÉĽ", + "ks" + ], + [ + "ɪ", + "p" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮaËIJ" + ], + [ + "ĠËĮaÊĬ", + "x" + ], + [ + "r", + "ËĪiËIJ" + ], + [ + "Ġc", + "ËĪÊĮl" + ], + [ + "m", + "os" + ], + [ + "ĠkËĪÊĮɾt", + "ËĮeËIJ" + ], + [ + "iËIJ", + "ɾ" + ], + [ + "k", + "ÉĻn" + ], + [ + "Ġd", + "ËĪu" + ], + [ + "n", + "aËIJ" + ], + [ + "Ġp", + "wËĪe" + ], + [ + "ËĮÉĶ", + "ɪ" + ], + [ + "ĠtÉķh", + "ËĪiÉĽ" + ], + [ + "Ġβ", + "ËĪi" + ], + [ + "ËĪiÉĽ", + "Éľt" + ], + [ + "Ġt", + "e" + ], + [ + "ËĪað", + "os" + ], + [ + "m", + "ËĪa" + ], + [ + "Ġv", + "ËĪo" + ], + [ + "Ġm", + "ËĪɪ" + ], + [ + "Ġb", + "ËĮi" + ], + [ + "a", + "d" + ], + [ + "d", + "o" + ], + [ + "Ġn", + "ËĪaÊĬ" + ], + [ + "ĠʲËĪy", + "Éľ" + ], + [ + "w", + "ËĪÉĽ" + ], + [ + "ËĪi", + "s" + ], + [ + "e", + "l" + ], + [ + "Ġpa", + "r" + ], + [ + "Ġt", + "ËĪai" + ], + [ + "ĠdËĪɪ", + "jaËIJ" + ], + [ + "h", + "ËĪi" + ], + [ + "Ġɾ", + "ËĪÊĮ" + ], + [ 
+ "Ġd", + "ËĪe" + ], + [ + "ËĪaɪ", + "d" + ], + [ + "Ġp", + "er" + ], + [ + "Ġs", + "ËĮÉĶ" + ], + [ + "w", + "e" + ], + [ + "ÊĬ", + "m" + ], + [ + "Ġi", + "n" + ], + [ + "ĠjËĪuËIJ", + "z" + ], + [ + "ËĪiËIJp", + "ÉĻl" + ], + [ + "ĠÊĭ", + "ËĪaËIJl" + ], + [ + "Ġe", + "tËĪÉĽ" + ], + [ + "ËĮÉĽ", + "m" + ], + [ + "Ġn", + "ËĪu" + ], + [ + "ËĪÉĽ", + "kt" + ], + [ + "ĠiËIJ", + "ɾ" + ], + [ + "Ġb", + "ɹ" + ], + [ + "Ġtsh", + "ËĪi" + ], + [ + "ĠÉĹ", + "ËĪÉĶÉľ" + ], + [ + "Ġkw", + "ËĮa" + ], + [ + "Ġf", + "ËĪuÉľ" + ], + [ + "w", + "ËĮa" + ], + [ + "Ġd", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪyÉĻ" + ], + [ + "ËĮÉĽ", + "ËIJ" + ], + [ + "r", + "ËĪa" + ], + [ + "Ġn", + "e" + ], + [ + "Ġz", + "ËĪyÉĻ" + ], + [ + "Ġb", + "ËĪaɪ" + ], + [ + "ĠÉŁ", + "ËĪÊĮb" + ], + [ + "ËĪuËIJ", + "to" + ], + [ + "ÊĬ", + "nt" + ], + [ + "Ġc", + "ʰ" + ], + [ + "ËĪÉĽnt", + "i" + ], + [ + "ËĪo", + "ÉĻ" + ], + [ + "Ġs", + "ËĮÊĮm" + ], + [ + "Ġl", + "Éij" + ], + [ + "ËĮe", + "va" + ], + [ + "ɾ", + "ÉĽ" + ], + [ + "nt", + "Éľ" + ], + [ + "Ġm", + "ËĪÉĽn" + ], + [ + "ËĪÉijËIJ", + "k" + ], + [ + "Ġki", + "l" + ], + [ + "ËĪon", + "es" + ], + [ + "f", + "f" + ], + [ + "Ġm", + "ËĪÉĽËIJ" + ], + [ + "Ġv", + "ËĪÉĻɪ" + ], + [ + "Ġ", + "ËĪÉĶËIJ" + ], + [ + "ĠËĮɪ", + "nt" + ], + [ + "ÊĬ", + "n" + ], + [ + "Ġw", + "ɪl" + ], + [ + "Ġs", + "in" + ], + [ + "ĠËĮa", + "lla" + ], + [ + "Ġaβ", + "ËĪia" + ], + [ + "p", + "i" + ], + [ + "ËĪo", + "Éľ" + ], + [ + "ɪj", + "ËĮaËIJ" + ], + [ + "k", + "u" + ], + [ + "Ġv", + "ËĪɪ" + ], + [ + "Ġtu", + "t" + ], + [ + "ĠtËĪe", + "Éľ" + ], + [ + "Ġh", + "ËĪÉĶ" + ], + [ + "β", + "ɾe" + ], + [ + "s", + "ÉĻɾ" + ], + [ + "Ġkh", + "ËĪai" + ], + [ + "Ġm", + "ËĪÉĶ" + ], + [ + "Ġt", + "a" + ], + [ + "Ġɲ", + "ËĪaËIJ" + ], + [ + "Ġn", + "u" + ], + [ + "ËĪuËIJ", + "n" + ], + [ + "ĠÉĻËIJ", + "Éľ" + ], + [ + "ĠËĪa", + "ÊĬf" + ], + [ + "ËĪiËIJd", + "Éľ" + ], + [ + "nt", + "i" + ], + [ + "Ġp", + "ËĪiËIJpÉĻl" + ], + [ + "Ġk", + "j" + ], + [ + "Ġp", + "e" + ], + [ + "Ġm", + "ËĪÉij" + ], + [ + "ËĮa", + "ɪ" + ], 
+ [ + "ËĪaËIJ", + "le" + ], + [ + "Ġv", + "ËĮÉĻËIJÉªÉľ" + ], + [ + "mp", + "o" + ], + [ + "ĠkËĪɪ", + "t" + ], + [ + "Ġn", + "ËĮÉĽ" + ], + [ + "ĠÉŁ", + "ËĪaËIJtaËIJ" + ], + [ + "ĠsËĪaËIJt", + "ʰ" + ], + [ + "ĠÉŁ", + "ËĪi" + ], + [ + "Ġs", + "o" + ], + [ + "Ġb", + "ËĪÉĽ" + ], + [ + "k", + "ËĪi" + ], + [ + "ɪt", + "i" + ], + [ + "Ġts", + "i" + ], + [ + "Ġk", + "Êģ" + ], + [ + "ËĮ", + "ÉĴ" + ], + [ + "É¡", + "ÉĻl" + ], + [ + "k", + "st" + ], + [ + "Ġm", + "ËĪÉĻËIJ" + ], + [ + "ËĪÊĮ", + "k" + ], + [ + "Ġn", + "ËĪaËIJÊĬ" + ], + [ + "Ġa", + "p" + ], + [ + "ĠlËĪɪ", + "kʰ" + ], + [ + "ll", + "i" + ], + [ + "ĠkwËĪa", + "l" + ], + [ + "Ġ", + "ËĪÉĻËIJ" + ], + [ + "Ġts", + "ËĪuei" + ], + [ + "Ġd", + "o" + ], + [ + "ĠkËIJ", + "jËĪo" + ], + [ + "ÊĬ", + "z" + ], + [ + "Ġp", + "ËĪaËIJ" + ], + [ + "Ġm", + "ËĪuËIJ" + ], + [ + "ĠÉ¡ÉĻ", + "v" + ], + [ + "r", + "ËĪi" + ], + [ + "Ġt", + "w" + ], + [ + "ËĮ", + "ɪn" + ], + [ + "d", + "ËĪÉij" + ], + [ + "Ġð", + "ËĪi" + ], + [ + "ĠËĪaËIJ", + "i" + ], + [ + "Ġh", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ËĮÉĽm" + ], + [ + "Ġpʰ", + "ËĪɪɾ" + ], + [ + "ÉĴ", + "m" + ], + [ + "ĠËĮ", + "eËIJ" + ], + [ + "Ġth", + "ËĪaiÉľ" + ], + [ + "Ġv", + "ËĪas" + ], + [ + "Ġn", + "ÉijËIJ" + ], + [ + "p", + "ÉĻn" + ], + [ + "Ġp", + "ËĮÉĻɾ" + ], + [ + "ĠÉĹ", + "ËĪaËIJɪ" + ], + [ + "ËĪou", + "Éľ" + ], + [ + "ĠÊIJ", + "ËĪuÉľ" + ], + [ + "ĠmËĪa", + "n" + ], + [ + "ĠtËĪÉĻ", + "ÉªÉľ" + ], + [ + "Ġl", + "ËĪaËIJÊĬ" + ], + [ + "m", + "ËĪÉĽnte" + ], + [ + "ĠfËĪa", + "m" + ], + [ + "s", + "jËĪÉĶ" + ], + [ + "Ġp", + "ËĪÉĻ" + ], + [ + "ËĪeËIJ", + "m" + ], + [ + "Ġp", + "ËĪÊĮr" + ], + [ + "j", + "ËĪi" + ], + [ + "Ġl", + "ÉĽ" + ], + [ + "Ġt", + "en" + ], + [ + "ËĪoËIJ", + "ra" + ], + [ + "k", + "i" + ], + [ + "ĠÊĤ", + "ËĪaËIJÊĬ" + ], + [ + "k", + "ɪ" + ], + [ + "bËIJ", + "e" + ], + [ + "ËĪa", + "lt" + ], + [ + "ð", + "ɪ" + ], + [ + "p", + "ËĪi" + ], + [ + "ĠËĮÉĽ", + "nt" + ], + [ + "Ġm", + "ËĪei" + ], + [ + "Ġh", + "ËĪÉĻÊĬ" + ], + [ + "Ġh", + "ËĪÉĽÉ¾" + ], + [ + "j", + "ËĪÉij" + ], + [ + 
"ĠhËĪÊĬ", + "aËIJ" + ], + [ + "m", + "Éľ" + ], + [ + "Ġd", + "ʰ" + ], + [ + "ĠtÊĥ", + "ËĪe" + ], + [ + "l", + "ËĪÉĽ" + ], + [ + "ËĪaËIJt", + "e" + ], + [ + "Ġp", + "ËĪuËIJ" + ], + [ + "Ġm", + "ËĪÊĬ" + ], + [ + "ËĪaËIJɪ", + "ÊĪ" + ], + [ + "d", + "iËIJ" + ], + [ + "Ġfɹ", + "ÉĴm" + ], + [ + "Ġh", + "ËĪÉijËIJ" + ], + [ + "β", + "o" + ], + [ + "ĠmËĪi", + "Éľn" + ], + [ + "Ġð", + "iËIJz" + ], + [ + "Ġk", + "ËĪou" + ], + [ + "ËĪiËIJ", + "na" + ], + [ + "Ġav", + "ËĮeva" + ], + [ + "Ġ", + "ËĪaËIJɾ" + ], + [ + "Ġn", + "ËĪuËIJɾ" + ], + [ + "Ġβ", + "ËĪe" + ], + [ + "Ġz", + "aɪn" + ], + [ + "ËĪÉĽ", + "d" + ], + [ + "É", + "Ĺ" + ], + [ + "ËĪeɪ", + "k" + ], + [ + "s", + "ËĮÉĻÊĬ" + ], + [ + "ËĪeËIJ", + "ÉŁ" + ], + [ + "ĠÊĤ", + "ËĪÉĻËIJ" + ], + [ + "j", + "e" + ], + [ + "cʰ", + "ËIJ" + ], + [ + "ËĪÉĶ", + "r" + ], + [ + "ÉĽ", + "ËIJ" + ], + [ + "ĠtÉķhËĪy", + "Ã¦Éľn" + ], + [ + "ĠËĮaɪn", + "ÉĻn" + ], + [ + "ĠiËIJ", + "n" + ], + [ + "ĠbËĪÊĮ", + "c" + ], + [ + "ËĪiËIJ", + "m" + ], + [ + "ɾ", + "as" + ], + [ + "ËĮÉĻ", + "s" + ], + [ + "Ġv", + "ËĪeËIJ" + ], + [ + "ĠËĪÉĻr", + "Éľ" + ], + [ + "Ġd", + "uËIJ" + ], + [ + "nt", + "ÉĻ" + ], + [ + "Ġpɹ", + "ËĪÉĴ" + ], + [ + "Ġb", + "ËĪɪ" + ], + [ + "ĠwËĪo", + "Éľ" + ], + [ + "n", + "ËĮi" + ], + [ + "Ġh", + "ÉIJ" + ], + [ + "Ġk", + "ËĪÉĽ" + ], + [ + "Ġe", + "t" + ], + [ + "jËĪÉĽ", + "ndo" + ], + [ + "ĠËĪai", + "Éľ" + ], + [ + "Ġl", + "i" + ], + [ + "ĠËĪaÊĬ", + "s" + ], + [ + "kËIJ", + "o" + ], + [ + "ĠÉĹ", + "ËĪyÉĻ" + ], + [ + "k", + "eËIJ" + ], + [ + "Ġf", + "ËĪiËIJl" + ], + [ + "Ġbʰ", + "ËĪaËIJi" + ], + [ + "ĠÉ¡ÉĻ", + "Êĥ" + ], + [ + "ÊĴ", + "ËĪe" + ], + [ + "Ġn", + "jËĪuËIJ" + ], + [ + "ĠËĪa", + "k" + ], + [ + "ĠÉĹ", + "ËĪaËIJ" + ], + [ + "z", + "ËĪa" + ], + [ + "v", + "ËĪe" + ], + [ + "ĠhËĮa", + "ÊĬ" + ], + [ + "ÉIJ", + "ç" + ], + [ + "ĠɾËĪÊĮ", + "kʰ" + ], + [ + "p", + "ËĪe" + ], + [ + "ĠtÉĻ", + "bi" + ], + [ + "ĠpËĪÊĮhÉĻl", + "ËĮeËIJ" + ], + [ + "Ġf", + "ËĪÉĽ" + ], + [ + "Ġw", + "ËĮɪtÊĥ" + ], + [ + "ĠtÉķËĪy", + "ÉĽÉľ" + ], + [ + "w", + 
"ËĮe" + ], + [ + "ËĮa", + "ɪt" + ], + [ + "ĠnÉijËIJ", + "x" + ], + [ + "ĠkËĪÉĶËIJ", + "n" + ], + [ + "ÊĬ", + "k" + ], + [ + "ĠbËĪaËIJ", + "d" + ], + [ + "Åĭ", + "ÉĻn" + ], + [ + "Ġn", + "i" + ], + [ + "Ġb", + "ËĪe" + ], + [ + "Ġm", + "ËĮÊĬ" + ], + [ + "ËĪa", + "r" + ], + [ + "ĠmËĮe", + "ɪk" + ], + [ + "Ġs", + "ËĪaËIJɾ" + ], + [ + "β", + "e" + ], + [ + "ĠtÉķhËĪi", + "ÉľÅĭ" + ], + [ + "it", + "ËĪe" + ], + [ + "k", + "ËĮe" + ], + [ + "ËĪÉĽËIJ", + "l" + ], + [ + "ËĮ", + "ÉĴn" + ], + [ + "ËĮ", + "Éij" + ], + [ + "Ġb", + "ËĪɪl" + ], + [ + "Ġw", + "ÊĬd" + ], + [ + "Ġb", + "ËĪoËIJl" + ], + [ + "r", + "d" + ], + [ + "i", + "ÉĻ" + ], + [ + "Ġd", + "a" + ], + [ + "Ġb", + "ËĪaËIJÊĬ" + ], + [ + "ĠnËĪÊĮmb", + "ÉĻɾ" + ], + [ + "ËĪaËIJɪ", + "Éľ" + ], + [ + "ĠÉĽ", + "m" + ], + [ + "Ġm", + "iËIJɾ" + ], + [ + "ËĪeɪ", + "m" + ], + [ + "l", + "os" + ], + [ + "ËĮÉĽ", + "t" + ], + [ + "ĠËĮaÊĬ", + "s" + ], + [ + "ĠmËĪa", + "Éľt" + ], + [ + "Ġw", + "ËĪuÉĻ" + ], + [ + "Ġw", + "ËĪeɪ" + ], + [ + "Ġse", + "ɲ" + ], + [ + "Ġb", + "jËĪÉĽ" + ], + [ + "Ġw", + "ÉĽn" + ], + [ + "f", + "l" + ], + [ + "Ġkh", + "wËĪa" + ], + [ + "d", + "ËĪÉĽ" + ], + [ + "v", + "ɹɪ" + ], + [ + "ĠËĪa", + "ɾ" + ], + [ + "jËĪÉiju", + "Éľ" + ], + [ + "ĠËĮaËIJpk", + "ËĮeËIJ" + ], + [ + "b", + "Êģ" + ], + [ + "ĠtËĪaɪ", + "m" + ], + [ + "Ġ", + "ËĪÉij" + ], + [ + "Ġs", + "ËĮa" + ], + [ + "Ġz", + "ËĪoɪ" + ], + [ + "ËĪÉĶɾ", + "a" + ], + [ + "Ġd", + "ËĪø" + ], + [ + "ËĪÉĶɾ", + "t" + ], + [ + "ĠÅĭ", + "ËĪÉĶ" + ], + [ + "m", + "in" + ], + [ + "Ġl", + "ËĪÊĬk" + ], + [ + "ËĪÉĶËIJ", + "t" + ], + [ + "ĠËĪÉĶ", + "tɾ" + ], + [ + "Ġf", + "ËĪaɪ" + ], + [ + "ĠÉ¡", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "ÉĻn" + ], + [ + "k", + "ËĪÉĶ" + ], + [ + "ĠvËĪÉĽ", + "ɹi" + ], + [ + "m", + "ÉĽ" + ], + [ + "ËĪaɪ", + "z" + ], + [ + "Ġe", + "sp" + ], + [ + "ɲ", + "a" + ], + [ + "Ġl", + "ËĪo" + ], + [ + "ËĪÉĽËIJ", + "ra" + ], + [ + "β", + "ËĪi" + ], + [ + "ou", + "Éľ" + ], + [ + "ËĮÉĻ", + "k" + ], + [ + "tÊĥ", + "uËIJ" + ], + [ + "Ġn", + "ËĪyÉĻ" + ], + [ + "ÊĪ", + 
"ɾ" + ], + [ + "ĠÉ¡", + "ËĪy" + ], + [ + "ĠtËĪo", + "ðo" + ], + [ + "ËĪɪ", + "çt" + ], + [ + "Ġm", + "ɪç" + ], + [ + "ĠËĪa", + "nd" + ], + [ + "Ġkw", + "ËĮÉĽl" + ], + [ + "ĠÊĤ", + "ËĪaËIJ" + ], + [ + "ĠnËĪi", + "Éľ" + ], + [ + "ËĪÉĶ", + "p" + ], + [ + "ËĪiËIJ", + "z" + ], + [ + "ĠÊĤ", + "ËĪaÊĬ" + ], + [ + "ĠɾËĮÉĻh", + "i" + ], + [ + "ĠsËĮÊĬ", + "o" + ], + [ + "ĠÉĽ", + "É¡" + ], + [ + "Ġd", + "Åĵ" + ], + [ + "ĠÉ¡ËĮaËIJ", + "ÉªÉľ" + ], + [ + "d", + "ɪ" + ], + [ + "l", + "ËĮa" + ], + [ + "st", + "ËĪi" + ], + [ + "ĠdËĮiËIJ", + "z" + ], + [ + "Ġt", + "ËĮÊĬ" + ], + [ + "θ", + "i" + ], + [ + "ĠËĪɪ", + "skËĮoËIJ" + ], + [ + "nd", + "ÉĻn" + ], + [ + "Ġts", + "v" + ], + [ + "Ġh", + "ËĪÉĻËIJ" + ], + [ + "ĠÊĥ", + "ËĪÊĬ" + ], + [ + "ÉĻt", + "ËĮeËIJ" + ], + [ + "p", + "ËĮÉĽ" + ], + [ + "ËĪaɾ", + "ÉĶn" + ], + [ + "Ġp", + "ÉĽÊģ" + ], + [ + "Ġ", + "y" + ], + [ + "m", + "nËĮeËIJ" + ], + [ + "ËĪÉĽ", + "llo" + ], + [ + "ĠÉ¡", + "ËĪÉĻ" + ], + [ + "ĠËĮa", + "d" + ], + [ + "ĠÊĥ", + "v" + ], + [ + "ËĪÊı", + "ɾ" + ], + [ + "r", + "ËĪe" + ], + [ + "y", + "ËIJ" + ], + [ + "Ġp", + "ËĪaËIJs" + ], + [ + "Ġ", + "ËĪÉĽn" + ], + [ + "ɪ", + "dÊĴ" + ], + [ + "ËĪua", + "i" + ], + [ + "Ġf", + "i" + ], + [ + "Ġt", + "ËĪyÉĻ" + ], + [ + "ËĪaËIJ", + "ÉŁ" + ], + [ + "Ġt", + "jËĪe" + ], + [ + "ËĪaËIJn", + "aËIJ" + ], + [ + "st", + "ɾ" + ], + [ + "Êİ", + "e" + ], + [ + "ËĮe", + "ɪt" + ], + [ + "b", + "a" + ], + [ + "ð", + "as" + ], + [ + "v", + "Êģ" + ], + [ + "Ġz", + "ËĪÉĻËIJ" + ], + [ + "ËĪaËIJ", + "li" + ], + [ + "ÉŁÊ°", + "eËIJ" + ], + [ + "ËĪaËIJt", + "eËIJ" + ], + [ + "Ġv", + "ËĪa" + ], + [ + "Ġsa", + "l" + ], + [ + "ËĪaËIJ", + "no" + ], + [ + "ĠÉ¡ÉĻ", + "z" + ], + [ + "ĠhËĪoËIJ", + "ti" + ], + [ + "Ġɲ", + "ËĪiÉĽ" + ], + [ + "t", + "Éľ" + ], + [ + "ĠËĪaËIJ", + "p" + ], + [ + "Ġw", + "ËĪÉĽl" + ], + [ + "Ġm", + "ËĪɪl" + ], + [ + "Ġfy", + "ËIJɾ" + ], + [ + "ËĪÉĽËIJs", + "aËIJ" + ], + [ + "Ġb", + "ËĮiËIJ" + ], + [ + "ËĪaËIJ", + "jaËIJ" + ], + [ + "ËĪɪ", + "p" + ], + [ + "Ġf", + "Êģ" + ], + [ + "tsi", + 
"ËĪoËIJne" + ], + [ + "Ġw", + "ËĪuÉľ" + ], + [ + "Ġv", + "i" + ], + [ + "ĠwËĪÉij", + "Éľn" + ], + [ + "ËĪoËIJ", + "n" + ], + [ + "ĠÉĹ", + "ËĪÉĻɪ" + ], + [ + "ĠÊĿ", + "ËĪo" + ], + [ + "Ġr", + "a" + ], + [ + "m", + "ÉĻnt" + ], + [ + "ËĪaÊĬ", + "nd" + ], + [ + "Ġp", + "ÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪaËIJÊĬ" + ], + [ + "oËIJ", + "ɾ" + ], + [ + "h", + "ËĪo" + ], + [ + "ĠÉĴ", + "n" + ], + [ + "ĠÊİ", + "e" + ], + [ + "ĠsËĪɪ", + "ks" + ], + [ + "É¡", + "n" + ], + [ + "ĠÉ¡", + "ËĪa" + ], + [ + "Ġ", + "θj" + ], + [ + "Ġp", + "ËĪe" + ], + [ + "sp", + "e" + ], + [ + "Ġv", + "ËĪÉĻ" + ], + [ + "Ġf", + "ËĪɪ" + ], + [ + "ĠËĮɪnt", + "ÊĬ" + ], + [ + "l", + "ÉĻn" + ], + [ + "Ġn", + "ËĪiËIJd" + ], + [ + "ĠsËĮÊĬ", + "a" + ], + [ + "ĠËĪu", + "m" + ], + [ + "Ġd", + "ËĪeɪ" + ], + [ + "ĠËĪÊĮ", + "bʰi" + ], + [ + "ËĪÉijËIJ", + "ɾ" + ], + [ + "Ġb", + "ËĪiÉĽÉľt" + ], + [ + "Êİ", + "os" + ], + [ + "Ġtsh", + "ËĪaiÉľ" + ], + [ + "ĠËĮɪ", + "skËĮaËIJ" + ], + [ + "ĠaÊĬ", + "ÉĻ" + ], + [ + "ĠËĪy", + "æ" + ], + [ + "Ġd", + "yn" + ], + [ + "Ġm", + "ËĪiËIJn" + ], + [ + "ĠËĪÊĮ", + "cʰËIJ" + ], + [ + "Ġs", + "ÉĽ" + ], + [ + "Ġn", + "ËĪy" + ], + [ + "Ġn", + "ËĮÉĽl" + ], + [ + "É¡", + "ɾ" + ], + [ + "Êĥ", + "ËĪe" + ], + [ + "ĠÊĤ", + "ËĮÉĽ" + ], + [ + "ĠËĪÉĽ", + "vɹɪ" + ], + [ + "ËĪÉĽl", + "p" + ], + [ + "ĠbËĪa", + "k" + ], + [ + "Ġ", + "eËIJ" + ], + [ + "Ġf", + "ËĪaËIJ" + ], + [ + "Ġk", + "ÉĽl" + ], + [ + "ĠËĪeËIJ", + "s" + ], + [ + "j", + "ËĪaËIJd" + ], + [ + "Ġl", + "ËĮi" + ], + [ + "mb", + "ɾe" + ], + [ + "k", + "tÉĻ" + ], + [ + "nt", + "a" + ], + [ + "t", + "ËĪu" + ], + [ + "Ġð", + "ËĪat" + ], + [ + "ĠËĪa", + "β" + ], + [ + "ÉĻɹ", + "i" + ], + [ + "ĠkwËĮÉĽ", + "lla" + ], + [ + "Ġb", + "ÉĻn" + ], + [ + "r", + "ËĮÉĽ" + ], + [ + "Ġn", + "ÉĶ" + ], + [ + "ĠÉ¡", + "ËĪɪ" + ], + [ + "ĠËĪa", + "p" + ], + [ + "ɹ", + "ÉĻ" + ], + [ + "ËĪa", + "Éľkh" + ], + [ + "ĠÊIJ", + "ËĪi" + ], + [ + "Ġ", + "ËĪÉijËIJ" + ], + [ + "ɪ", + "É¡ÉĻn" + ], + [ + "Ġw", + "ËĪai" + ], + [ + "Ġp", + "ÉĻt" + ], + [ + "kËIJ", + "a" + ], + [ + 
"Ġb", + "ËĪÉĽËIJ" + ], + [ + "ËĪeËIJ", + "Êĭ" + ], + [ + "ls", + "ÉĻÊĬ" + ], + [ + "ĠcËĪaËIJh", + "ɪËĮeËIJ" + ], + [ + "Ġk", + "ÉĻn" + ], + [ + "ĠËĮaɪn", + "ÉĻm" + ], + [ + "ËĪuËIJ", + "t" + ], + [ + "Ġh", + "ËĪaÊĬ" + ], + [ + "Ġt", + "ËĪanto" + ], + [ + "ĠhÉIJ", + "z" + ], + [ + "Ġs", + "ËĪÊĮɾ" + ], + [ + "Ġn", + "o" + ], + [ + "Ġt", + "ËĪÉĶËIJ" + ], + [ + "Ġz", + "ËĪaɪ" + ], + [ + "ĠtÉķËĪiÉĽ", + "Éľ" + ], + [ + "Ġko", + "zËĪi" + ], + [ + "Ġk", + "ËĪei" + ], + [ + "ð", + "ËĪÉĶɾ" + ], + [ + "ËĮÉĶ", + "Êģ" + ], + [ + "Ġt", + "ËĪÊĮɾ" + ], + [ + "ĠÊIJ", + "ËĪÉĻ" + ], + [ + "ĠÉķËĪy", + "ÉĽÉľ" + ], + [ + "ĠmËĮÊĬ", + "ÉŁÊ°eËIJ" + ], + [ + "m", + "f" + ], + [ + "Ġv", + "ËĪiËIJdÉľ" + ], + [ + "k", + "ËĪa" + ], + [ + "ĠÉIJ", + "É¡" + ], + [ + "k", + "w" + ], + [ + "ĠÊģ", + "ÉĽ" + ], + [ + "x", + "ÉĻn" + ], + [ + "Ġd", + "ÊĬ" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮeËIJ" + ], + [ + "jËĪaËIJd", + "aËIJ" + ], + [ + "Ġf", + "ÉĻ" + ], + [ + "ĠËĮi", + "mp" + ], + [ + "Ġh", + "ɪz" + ], + [ + "Ġ", + "ʰÏĩ" + ], + [ + "ËĪoËIJ", + "ni" + ], + [ + "Ġx", + "ËĪiÉľ" + ], + [ + "ËĪeËIJ", + "sÊĪ" + ], + [ + "Êı", + "bÉľ" + ], + [ + "ËĮÉĶɾ", + "ke" + ], + [ + "ĠÉ¡", + "ËĪÉĻÊĬ" + ], + [ + "ËĪɪ", + "ÊĥÉĻn" + ], + [ + "l", + "es" + ], + [ + "Ġf", + "ËĪiËIJ" + ], + [ + "É¡", + "tÉĻ" + ], + [ + "ËĪeËIJ", + "re" + ], + [ + "Ġv", + "ËĮaËIJ" + ], + [ + "Ġ", + "ËĪeɪ" + ], + [ + "Ġm", + "ËĪuÉĻÉľn" + ], + [ + "ĠÉ¡ËĪÊĬ", + "d" + ], + [ + "ĠmËĮa", + "ɪn" + ], + [ + "z", + "ËĪe" + ], + [ + "ĠlËĪi", + "Éľ" + ], + [ + "Ġm", + "u" + ], + [ + "Ġk", + "ËĮÉĽl" + ], + [ + "Ġj", + "ËĮÉĻh" + ], + [ + "Ġf", + "ËĮÉĶɾ" + ], + [ + "f", + "ɹ" + ], + [ + "Ġk", + "ËĪaɪn" + ], + [ + "ĠËĪÉĴ", + "lsÉĻÊĬ" + ], + [ + "θ", + "ɪÅĭ" + ], + [ + "Ġth", + "ËĪonÉ¡Éľ" + ], + [ + "t", + "ËĪÉij" + ], + [ + "θj", + "o" + ], + [ + "m", + "ËĪÉĶ" + ], + [ + "Ġ", + "os" + ], + [ + "Ġs", + "ÊĬ" + ], + [ + "ĠsËĪÊĮ", + "mÉĻ" + ], + [ + "ĠvËĮÉĽ", + "n" + ], + [ + "n", + "ËĪo" + ], + [ + "ĠËĪak", + "tÊĥuËIJ" + ], + [ + "É£", + "a" + ], + [ + "Ġtʰ", + "i" + 
], + [ + "Ġf", + "ËĮi" + ], + [ + "Ġv", + "ËĪÉĽl" + ], + [ + "ĠtËĪu", + "tËIJi" + ], + [ + "x", + "os" + ] + ] + } +} \ No newline at end of file From 4e09d1c69fe7e42d1387ba0d11aa8e82cccd4ff3 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 6 Feb 2026 22:17:45 -0800 Subject: [PATCH 38/94] Magpietts decoderonly 2601 valinfer (#61) * add inference loggin in val step Signed-off-by: Paarth Neekhara * infer during validation Signed-off-by: Paarth Neekhara * use local transformer for val Signed-off-by: Paarth Neekhara * ignore eval models when loading weights Signed-off-by: Paarth Neekhara * logging statements Signed-off-by: Paarth Neekhara * asr issue Signed-off-by: Paarth Neekhara * bug fix for multinode Signed-off-by: Paarth Neekhara * add whisper asr as well for val infer Signed-off-by: Paarth Neekhara * add missing changes Signed-off-by: Paarth Neekhara * handle errors Signed-off-by: Paarth Neekhara * allow non lhotse validation loader also Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 2 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 3 +- .../tts/data/text_to_speech_dataset_lhotse.py | 8 + nemo/collections/tts/models/easy_magpietts.py | 291 +++++++++++++++++- .../magpietts_preference_optimization.py | 76 +---- nemo/collections/tts/parts/utils/helpers.py | 83 ++++- 6 files changed, 384 insertions(+), 79 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 6166fd68968f..8c44fef3f173 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -60,7 +60,7 @@ model: embedding_dim: 1536 hidden_dim: 1536 - audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. + audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. 
Set equal to hidden_dim for no projection. codecmodel_path: ??? max_epochs: ${max_epochs} steps_per_epoch: ${weighted_sampling_steps_per_epoch} diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 5461af8d6ee5..af943ee25dbb 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -59,7 +59,7 @@ model: embedding_dim: 1536 hidden_dim: 1536 - audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. + audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? # Local transformer parameters for autoregressive codebook prediction within a frame @@ -141,6 +141,7 @@ model: shuffle: false num_workers: 2 pin_memory: true + force_map_dataset: true input_cfg: - type: lhotse_shar diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 480119202e28..5e088708573f 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -225,6 +225,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: context_text_tokens_len_list = [] context_has_text_context_list = [] reward_list = [] + language_list = [] raw_text_list = ( [] ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. 
@@ -236,6 +237,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) + language = ( + cut.supervisions[0].language + if cut.supervisions[0].has_custom("language") + else "en" + ) + language_list.append(language) # target audio or target codes if self.load_cached_codes_if_available and cut.has_custom("target_codes"): @@ -444,6 +451,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: batch_dict = { "dataset_names": dataset_name_list, "raw_texts": raw_text_list, + "languages": language_list, "text": collate_vectors(token_list, padding_value=self.pad_id), # (B, max_len) "text_lens": torch.IntTensor(token_len_list), } diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 4c9b26ded4d7..224100e07ff6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import random import time + +import numpy as np +import soundfile as sf from dataclasses import dataclass from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -25,8 +29,12 @@ from omegaconf import DictConfig from torch import nn from torch.utils.data import get_worker_info +from torch.utils.data.distributed import DistributedSampler from transformers import AutoConfig, AutoModel, AutoModelForCausalLM +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( MagpieTTSLhotseDataset, @@ -42,7 +50,12 @@ SpecialAudioToken, cosine_schedule, ) -from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.collections.tts.parts.utils.helpers import ( + get_mask_from_lengths, + get_speaker_embeddings_from_filepaths, + process_text_for_cer, + transcribe_with_whisper, +) from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -496,6 +509,31 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + # Validation inference with metrics (optional) + self.run_val_inference = cfg.get('run_val_inference', False) + self.use_multilingual_asr = cfg.get('use_multilingual_asr', False) + if self.run_val_inference: + logging.info("Loading eval models for validation inference (ASR and speaker verification)...") + if self.use_multilingual_asr: + from transformers import WhisperForConditionalGeneration, WhisperProcessor + + self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + self.whisper_model = 
WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") + self.whisper_model.eval() + self._eval_asr_model = None + else: + self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained( + model_name="nvidia/parakeet-ctc-0.6b" + ) + self._eval_asr_model.freeze() + self.whisper_processor = None + self.whisper_model = None + self._eval_speaker_verification_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( + model_name='titanet_large' + ) + self._eval_speaker_verification_model.freeze() + logging.info("Eval models loaded successfully.") + def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model @@ -505,7 +543,14 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): return {} # Don't save the speaker verification and codec model in the state dict state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] + keys_substrings_to_exclude = [ + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + ] for key in list(state_dict.keys()): if any([substring in key for substring in keys_substrings_to_exclude]): del state_dict[key] @@ -521,7 +566,14 @@ def load_state_dict(self, state_dict, strict=True): if strict == False: super().load_state_dict(state_dict, strict=False) for name, child in self.named_children(): - if name in ['_speaker_verification_model', '_codec_model']: + if name in [ + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + ]: continue if any(param.numel() > 0 for param in child.parameters()): # If the module has parameters, we want to change the default mapping so that the state_dict gets @@ -1124,8 +1176,8 @@ def 
prepare_context_tensors( context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - self.audio_bos_id, - self.audio_eos_id, + self.context_audio_bos_id, + self.context_audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) @@ -1848,6 +1900,10 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + print(f"[Validation] global_rank: {self.global_rank}, " + f"local_rank: {self.local_rank}, " + f"world_size: {self.trainer.world_size}, " + f"batch_idx: {batch_idx}") if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] @@ -1915,6 +1971,140 @@ def validation_step(self, batch, batch_idx): phoneme_loss = batch_output.phoneme_loss val_output['val_phoneme_loss'] = phoneme_loss + # Run inference and compute metrics if enabled + if self.run_val_inference: + infer_output = self.infer_batch( + batch, + max_decoder_steps=220, + temperature=0.7, + topk=80, + use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR + ) + + # Get audio output directory + audio_dir = self.trainer.log_dir + audio_dir = os.path.join(audio_dir, 'val_audios', f'epoch_{self.trainer.current_epoch}') + os.makedirs(audio_dir, exist_ok=True) + + # Save predicted and context audio, collect paths for metrics + predicted_audio_paths = [] + context_audio_paths = [] + + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self.remove_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + ) + context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio(context_audio_codes_cleaned, context_audio_codes_lens_cleaned) + + for idx in range(infer_output.predicted_audio.size(0)): + audio_np = infer_output.predicted_audio[idx].float().detach().cpu().numpy() + audio_np = 
audio_np[: infer_output.predicted_audio_lens[idx]] + + # Log first batch on first device to wandb/tensorboard (first 3 samples) + if batch_idx == 0 and self.global_rank == 0 and idx < 3: + for logger in self.loggers: + if isinstance(logger, WandbLogger): + logger.experiment.log( + { + f"Audio_Generated/Example_{idx}": wandb.Audio( + audio_np, sample_rate=self.output_sample_rate, caption="generated" + ) + } + ) + elif isinstance(logger, TensorBoardLogger): + logger.experiment.add_audio( + f'Example_{idx}/generated', + audio_np, + global_step=self.global_step, + sample_rate=self.output_sample_rate, + ) + + # Save predicted audio to disk + if audio_dir: + audio_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}.wav') + sf.write(audio_path, audio_np, self.output_sample_rate) + predicted_audio_paths.append(audio_path) + + # Save context audio for SSIM computation + ctx_audio_np = context_audio_codes_cleaned[idx].float().detach().cpu().numpy()[: context_audio_lens_cleaned[idx]] + ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') + sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) + context_audio_paths.append(ctx_path) + + # Compute metrics if we have audio paths + if predicted_audio_paths and context_audio_paths: + with torch.no_grad(): + # ASR transcription for CER/WER + if self.use_multilingual_asr: + self.whisper_model.to(self.device) + languages = batch.get('languages', None) + if languages is None: + languages = ['en'] * len(predicted_audio_paths) + pred_transcripts = [] + for audio_path, lang in zip(predicted_audio_paths, languages): + try: + transcript = transcribe_with_whisper( + audio_path, lang, self.whisper_processor, self.whisper_model, self.device, normalizer=None + ) + pred_transcripts.append(process_text_for_cer(transcript)) + except Exception as e: + logging.warning(f"Val ASR transcription failed for {audio_path}: {e}") + pred_transcripts.append(None) + else: + 
pred_transcripts = self._eval_asr_model.transcribe( + predicted_audio_paths, + batch_size=len(predicted_audio_paths), + override_config=TranscribeConfig( + use_lhotse=False, + batch_size=len(predicted_audio_paths), + num_workers=0 + ) + ) + pred_transcripts = [process_text_for_cer(t.text) for t in pred_transcripts] + + # Speaker embeddings for SSIM + try: + pred_embeddings = get_speaker_embeddings_from_filepaths( + predicted_audio_paths, self._eval_speaker_verification_model, self.device + ) + ctx_embeddings = get_speaker_embeddings_from_filepaths( + context_audio_paths, self._eval_speaker_verification_model, self.device + ) + except Exception as e: + logging.warning(f"Val speaker embeddings failed: {e}") + pred_embeddings = ctx_embeddings = None + + # Compute per-sample metrics for successful cases only + batch_cer, batch_wer, batch_ssim = [], [], [] + for idx in range(len(predicted_audio_paths)): + if pred_transcripts[idx] is None: + continue + gt_transcript = process_text_for_cer(batch['raw_texts'][idx]) + cer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True) + wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) + batch_cer.append(cer) + batch_wer.append(wer) + if pred_embeddings is not None and ctx_embeddings is not None: + pred_emb = pred_embeddings[idx].cpu().float().numpy() + ctx_emb = ctx_embeddings[idx].cpu().float().numpy() + ssim = np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + batch_ssim.append(ssim) + logging.info( + f"[Val] rank{self.global_rank}_batch{batch_idx}_idx{idx}: " + f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' 
| Pred: '{pred_transcripts[idx][:50]}...'" + ) + + if batch_cer: + val_output['val_cer'] = torch.tensor(np.mean(batch_cer), device=self.device) + val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) + if self.use_multilingual_asr: + langs = batch.get('languages', ['en'] * len(predicted_audio_paths)) + val_output['val_languages'] = [langs[i] for i in range(len(pred_transcripts)) if pred_transcripts[i] is not None] + val_output['val_cer_list'] = batch_cer + val_output['val_wer_list'] = batch_wer + if batch_ssim: + val_output['val_ssim'] = torch.tensor(np.mean(batch_ssim), device=self.device) + self.validation_step_outputs.append(val_output) return val_output @@ -1935,6 +2125,39 @@ def on_validation_epoch_end(self): val_phoneme_loss = collect("val_phoneme_loss") self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) + if self.run_val_inference: + # Collect metrics only from outputs that have them + def collect_if_exists(key): + values = [x[key] for x in self.validation_step_outputs if key in x] + if values: + return torch.stack(values).mean() + return None + + val_cer = collect_if_exists("val_cer") + val_wer = collect_if_exists("val_wer") + val_ssim = collect_if_exists("val_ssim") + + if val_cer is not None: + self.log("val/cer", val_cer, prog_bar=True, sync_dist=True) + if val_wer is not None: + self.log("val/wer", val_wer, prog_bar=True, sync_dist=True) + if val_ssim is not None: + self.log("val/ssim", val_ssim, prog_bar=True, sync_dist=True) + + if self.use_multilingual_asr: + lang_cer = {} + lang_wer = {} + for x in self.validation_step_outputs: + if 'val_languages' not in x or 'val_cer_list' not in x or 'val_wer_list' not in x: + continue + for lang, cer, wer in zip(x['val_languages'], x['val_cer_list'], x['val_wer_list']): + lang_cer.setdefault(lang, []).append(cer) + lang_wer.setdefault(lang, []).append(wer) + for lang in lang_cer: + self.log(f"val/cer_lang_{lang}", torch.tensor(np.mean(lang_cer[lang]), 
device=self.device), prog_bar=True, sync_dist=True) + for lang in lang_wer: + self.log(f"val/wer_lang_{lang}", torch.tensor(np.mean(lang_wer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.validation_step_outputs.clear() # free memory def get_dataset(self, dataset_cfg, dataset_type): @@ -2043,11 +2266,69 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: return data_loader def setup_validation_data(self, cfg): + self._validation_uses_lhotse = cfg.get("use_lhotse", False) self._validation_dl = self._setup_test_dataloader(cfg) def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) + def val_dataloader(self): + """ + Override val_dataloader to lazily wrap with DistributedSampler for non-lhotse + validation. This is needed because use_distributed_sampler=False is set for lhotse + training, which also prevents Lightning from auto-wrapping the non-lhotse validation + dataloader. We do this lazily (here instead of in setup_validation_data) because + distributed is not yet initialized when setup_validation_data is called during __init__. 
+ """ + if self._validation_dl is None: + self._validation_dl = [] + + if getattr(self, '_validation_uses_lhotse', False): + print(f"[val_dataloader] rank={self.global_rank}: Using lhotse, skipping DistributedSampler wrap") + return self._validation_dl + + if not torch.distributed.is_initialized(): + print(f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap") + return self._validation_dl + + if getattr(self, '_val_dl_wrapped_with_dist_sampler', False): + return self._validation_dl + + # Wrap the validation dataloader(s) with DistributedSampler + dataloaders = self._validation_dl if isinstance(self._validation_dl, list) else [self._validation_dl] + wrapped = [] + for i, dl in enumerate(dataloaders): + if dl is not None and not isinstance(dl.sampler, DistributedSampler): + print(f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " + f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " + f"batch_size={dl.batch_size}, num_workers={dl.num_workers})") + sampler = DistributedSampler(dl.dataset, shuffle=False) + new_dl = torch.utils.data.DataLoader( + dl.dataset, + sampler=sampler, + batch_size=dl.batch_size, + num_workers=dl.num_workers, + collate_fn=dl.collate_fn, + pin_memory=dl.pin_memory, + drop_last=dl.drop_last, + worker_init_fn=dl.worker_init_fn, + persistent_workers=dl.persistent_workers, + ) + wrapped.append(new_dl) + else: + sampler_type = type(dl.sampler).__name__ if dl is not None else "N/A" + print(f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " + f"sampler={sampler_type}, skipping wrap") + wrapped.append(dl) + + if isinstance(self._validation_dl, list): + self._validation_dl = wrapped + else: + self._validation_dl = wrapped[0] + + self._val_dl_wrapped_with_dist_sampler = True + return self._validation_dl + def _sample_audio_codes( self, last_hidden: torch.Tensor, diff --git 
a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index d583cacadd74..a6d11f6ac1ae 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -15,7 +15,6 @@ import json import os import random -import string from typing import Optional import librosa @@ -27,7 +26,11 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors +from nemo.collections.tts.parts.utils.helpers import ( + get_speaker_embeddings_from_filepaths, + process_text_for_cer, + transcribe_with_whisper, +) from nemo.utils import logging try: @@ -1030,72 +1033,3 @@ def collect(key): for val_outputs in self.validation_step_outputs: val_outputs.clear() - - -# Utility functions -def process_text_for_cer(input_text): - """ - Normalizes text for CER/WER calculation. 
- Taken from hallucination_eval.py - """ - # Convert text to lowercase - lower_case_text = input_text.lower() - - # Remove commas from text - no_comma_text = lower_case_text.replace(",", "") - # Replace "-" with spaces - no_dash_text = no_comma_text.replace("-", " ") - no_dash_text = no_dash_text.replace("'", "") - no_dash_text = no_dash_text.replace(";", "") - no_dash_text = no_dash_text.replace(".", "") - - # Replace double spaces with single space - single_space_text = " ".join(no_dash_text.split()) - - single_space_text = single_space_text.translate(str.maketrans('', '', string.punctuation)) - - # @shehzeen: Added this to handle some common errors in ASR transcripts - single_space_text = single_space_text.replace("h t t p", "http") - single_space_text = single_space_text.replace("w w w", "www") - - return single_space_text - - -def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): - audio_batch = [] - audio_lengths = [] - for filepath in filepaths: - audio, sr = sf.read(filepath) - if sr != 16000: - audio = librosa.core.resample(audio, orig_sr=sr, target_sr=16000) - audio_tensor = torch.tensor(audio, dtype=torch.float32, device=device) - audio_batch.append(audio_tensor) - audio_lengths.append(audio_tensor.size(0)) - - batch_audio_lens = torch.tensor(audio_lengths, device=device).long() - max_audio_len = int(batch_audio_lens.max().item()) - audio_batch = stack_tensors(audio_batch, max_lens=[max_audio_len]) - - _, speaker_embeddings = speaker_verification_model.forward( - input_signal=audio_batch, input_signal_length=batch_audio_lens - ) - - return speaker_embeddings - - -def transcribe_with_whisper( - audio_filepath, language, whisper_processor, whisper_model, device, normalizer: Optional[Normalizer] = None -): - speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) - forced_decoder_ids = ( - whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None - ) - inputs = 
whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features - inputs = inputs.to(device) - with torch.no_grad(): - predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) - transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) - result = transcription[0] - if normalizer is not None: - result = normalizer.normalize(result) - return result diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index 1b1855cf356d..a8ee48ce57ef 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -42,15 +42,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import string from enum import Enum -from typing import Optional, Tuple +from typing import Any, Optional, Tuple import librosa import matplotlib.pylab as plt import numpy as np +import soundfile as sf import torch from numba import jit, prange +from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.collections.tts.torch.tts_data_types import DATA_STR2DATA_CLASS, MAIN_DATA_TYPES, WithLens from nemo.utils import logging from nemo.utils.decorators import deprecated @@ -802,3 +805,81 @@ def g2p_backward_compatible_support(g2p_target: str) -> str: # for backward compatibility g2p_target_new = g2p_target.replace("nemo_text_processing.g2p", "nemo.collections.tts.g2p") return g2p_target_new + + +def process_text_for_cer(input_text): + """ + Normalizes text for CER/WER calculation. 
+ """ + # Convert text to lowercase + lower_case_text = input_text.lower() + + # Remove commas from text + no_comma_text = lower_case_text.replace(",", "") + # Replace "-" with spaces + no_dash_text = no_comma_text.replace("-", " ") + no_dash_text = no_dash_text.replace("'", "") + no_dash_text = no_dash_text.replace(";", "") + no_dash_text = no_dash_text.replace(".", "") + + # Replace double spaces with single space + single_space_text = " ".join(no_dash_text.split()) + + single_space_text = single_space_text.translate(str.maketrans('', '', string.punctuation)) + + # Handle some common errors in ASR transcripts + single_space_text = single_space_text.replace("h t t p", "http") + single_space_text = single_space_text.replace("w w w", "www") + + return single_space_text + + +def transcribe_with_whisper( + audio_filepath: str, + language: Optional[str], + whisper_processor: Any, + whisper_model: Any, + device: torch.device, + normalizer: Optional[Any] = None, +) -> str: + """ + Transcribe audio with Whisper. Optionally normalize the transcript if a normalizer is provided. + """ + speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) + forced_decoder_ids = ( + whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None + ) + inputs = whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features + inputs = inputs.to(device) + with torch.no_grad(): + predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) + transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) + result = transcription[0] + if normalizer is not None: + result = normalizer.normalize(result) + return result + + +def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): + """ + Get speaker embeddings from audio filepaths using a speaker verification model. 
+ """ + audio_batch = [] + audio_lengths = [] + for filepath in filepaths: + audio, sr = sf.read(filepath) + if sr != 16000: + audio = librosa.core.resample(audio, orig_sr=sr, target_sr=16000) + audio_tensor = torch.tensor(audio, dtype=torch.float32, device=device) + audio_batch.append(audio_tensor) + audio_lengths.append(audio_tensor.size(0)) + + batch_audio_lens = torch.tensor(audio_lengths, device=device).long() + max_audio_len = int(batch_audio_lens.max().item()) + audio_batch = stack_tensors(audio_batch, max_lens=[max_audio_len]) + + _, speaker_embeddings = speaker_verification_model.forward( + input_signal=audio_batch, input_signal_length=batch_audio_lens + ) + + return speaker_embeddings From 97d98daa7273ba854506770cb6e794347d5ce5e6 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sat, 7 Feb 2026 06:18:32 +0000 Subject: [PATCH 39/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- .../tts/data/text_to_speech_dataset_lhotse.py | 6 +- nemo/collections/tts/models/easy_magpietts.py | 78 +++++++++++++------ 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 5e088708573f..ba111838efa3 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -237,11 +237,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) - language = ( - cut.supervisions[0].language - if cut.supervisions[0].has_custom("language") - else "en" - ) + language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" language_list.append(language) # target audio or target codes diff --git 
a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 224100e07ff6..c51315140d31 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,13 +14,12 @@ import os import random import time - -import numpy as np -import soundfile as sf from dataclasses import dataclass from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple +import numpy as np +import soundfile as sf import torch import wandb from hydra.utils import instantiate @@ -1900,10 +1899,12 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch - print(f"[Validation] global_rank: {self.global_rank}, " - f"local_rank: {self.local_rank}, " - f"world_size: {self.trainer.world_size}, " - f"batch_idx: {batch_idx}") + print( + f"[Validation] global_rank: {self.global_rank}, " + f"local_rank: {self.local_rank}, " + f"world_size: {self.trainer.world_size}, " + f"batch_idx: {batch_idx}" + ) if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] @@ -1978,7 +1979,7 @@ def validation_step(self, batch, batch_idx): max_decoder_steps=220, temperature=0.7, topk=80, - use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR + use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, ) # Get audio output directory @@ -1994,7 +1995,9 @@ def validation_step(self, batch, batch_idx): codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio(context_audio_codes_cleaned, context_audio_codes_lens_cleaned) + context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned + ) for 
idx in range(infer_output.predicted_audio.size(0)): audio_np = infer_output.predicted_audio[idx].float().detach().cpu().numpy() @@ -2026,7 +2029,13 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths.append(audio_path) # Save context audio for SSIM computation - ctx_audio_np = context_audio_codes_cleaned[idx].float().detach().cpu().numpy()[: context_audio_lens_cleaned[idx]] + ctx_audio_np = ( + context_audio_codes_cleaned[idx] + .float() + .detach() + .cpu() + .numpy()[: context_audio_lens_cleaned[idx]] + ) ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) context_audio_paths.append(ctx_path) @@ -2044,7 +2053,12 @@ def validation_step(self, batch, batch_idx): for audio_path, lang in zip(predicted_audio_paths, languages): try: transcript = transcribe_with_whisper( - audio_path, lang, self.whisper_processor, self.whisper_model, self.device, normalizer=None + audio_path, + lang, + self.whisper_processor, + self.whisper_model, + self.device, + normalizer=None, ) pred_transcripts.append(process_text_for_cer(transcript)) except Exception as e: @@ -2055,10 +2069,8 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths, batch_size=len(predicted_audio_paths), override_config=TranscribeConfig( - use_lhotse=False, - batch_size=len(predicted_audio_paths), - num_workers=0 - ) + use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0 + ), ) pred_transcripts = [process_text_for_cer(t.text) for t in pred_transcripts] @@ -2099,7 +2111,9 @@ def validation_step(self, batch, batch_idx): val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) if self.use_multilingual_asr: langs = batch.get('languages', ['en'] * len(predicted_audio_paths)) - val_output['val_languages'] = [langs[i] for i in range(len(pred_transcripts)) if pred_transcripts[i] is not None] + val_output['val_languages'] = [ + langs[i] for i in 
range(len(pred_transcripts)) if pred_transcripts[i] is not None + ] val_output['val_cer_list'] = batch_cer val_output['val_wer_list'] = batch_wer if batch_ssim: @@ -2154,9 +2168,19 @@ def collect_if_exists(key): lang_cer.setdefault(lang, []).append(cer) lang_wer.setdefault(lang, []).append(wer) for lang in lang_cer: - self.log(f"val/cer_lang_{lang}", torch.tensor(np.mean(lang_cer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.log( + f"val/cer_lang_{lang}", + torch.tensor(np.mean(lang_cer[lang]), device=self.device), + prog_bar=True, + sync_dist=True, + ) for lang in lang_wer: - self.log(f"val/wer_lang_{lang}", torch.tensor(np.mean(lang_wer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.log( + f"val/wer_lang_{lang}", + torch.tensor(np.mean(lang_wer[lang]), device=self.device), + prog_bar=True, + sync_dist=True, + ) self.validation_step_outputs.clear() # free memory @@ -2288,7 +2312,9 @@ def val_dataloader(self): return self._validation_dl if not torch.distributed.is_initialized(): - print(f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap") + print( + f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap" + ) return self._validation_dl if getattr(self, '_val_dl_wrapped_with_dist_sampler', False): @@ -2299,9 +2325,11 @@ def val_dataloader(self): wrapped = [] for i, dl in enumerate(dataloaders): if dl is not None and not isinstance(dl.sampler, DistributedSampler): - print(f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " - f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " - f"batch_size={dl.batch_size}, num_workers={dl.num_workers})") + print( + f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " + f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " + 
f"batch_size={dl.batch_size}, num_workers={dl.num_workers})" + ) sampler = DistributedSampler(dl.dataset, shuffle=False) new_dl = torch.utils.data.DataLoader( dl.dataset, @@ -2317,8 +2345,10 @@ def val_dataloader(self): wrapped.append(new_dl) else: sampler_type = type(dl.sampler).__name__ if dl is not None else "N/A" - print(f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " - f"sampler={sampler_type}, skipping wrap") + print( + f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " + f"sampler={sampler_type}, skipping wrap" + ) wrapped.append(dl) if isinstance(self._validation_dl, list): From 003c4395cb30c919b94e3702e66169d9e4b715b6 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 10:08:41 -0800 Subject: [PATCH 40/94] bug fixes in inference and logging Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 60 ++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index c51315140d31..0e3f6ce55797 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import json import os import random import time @@ -519,6 +520,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") self.whisper_model.eval() + for param in self.whisper_model.parameters(): + param.requires_grad = False self._eval_asr_model = None else: self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained( @@ -533,6 +536,38 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._eval_speaker_verification_model.freeze() logging.info("Eval models loaded successfully.") + def setup_optimizer_param_groups(self): + """ + Override to exclude frozen eval/inference-only models from the optimizer. + This prevents optimizer state mismatch errors when resuming from checkpoints + that were saved before these eval models were added. + """ + modules_to_exclude = { + '_speaker_verification_model', + # '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + } + + # Collect parameter ids to exclude + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + # Build param group with only trainable (non-excluded) parameters + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. 
On save, we remove _speaker_verification_model and _codec_model @@ -1980,6 +2015,8 @@ def validation_step(self, batch, batch_idx): temperature=0.7, topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, + use_cfg=True, + cfg_scale=2.5 ) # Get audio output directory @@ -2030,7 +2067,7 @@ def validation_step(self, batch, batch_idx): # Save context audio for SSIM computation ctx_audio_np = ( - context_audio_codes_cleaned[idx] + context_audio_cleaned[idx] .float() .detach() .cpu() @@ -2096,16 +2133,35 @@ def validation_step(self, batch, batch_idx): wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) batch_cer.append(cer) batch_wer.append(wer) + ssim = None if pred_embeddings is not None and ctx_embeddings is not None: pred_emb = pred_embeddings[idx].cpu().float().numpy() ctx_emb = ctx_embeddings[idx].cpu().float().numpy() - ssim = np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) batch_ssim.append(ssim) logging.info( f"[Val] rank{self.global_rank}_batch{batch_idx}_idx{idx}: " f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' 
| Pred: '{pred_transcripts[idx][:50]}...'" ) + # Save per-audio metrics JSON file alongside the audio file + if audio_dir: + metrics_dict = { + 'cer': float(cer), + 'wer': float(wer), + 'ssim': ssim, + 'gt_transcript': gt_transcript, + 'pred_transcript': pred_transcripts[idx], + 'audio_path': predicted_audio_paths[idx], + 'epoch': self.trainer.current_epoch, + 'global_step': self.global_step, + } + metrics_path = os.path.join( + audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_metrics.json' + ) + with open(metrics_path, 'w') as f: + json.dump(metrics_dict, f, indent=2) + if batch_cer: val_output['val_cer'] = torch.tensor(np.mean(batch_cer), device=self.device) val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) From af7e76b253a20bca9ca86ffcaa0143c1d44a6d10 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 13:38:06 -0800 Subject: [PATCH 41/94] more tests Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 208 ++++++-- .../tts/test_infer_vs_process_batch.py | 487 ++++++++++++++++++ 2 files changed, 660 insertions(+), 35 deletions(-) create mode 100644 tests/collections/tts/test_infer_vs_process_batch.py diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 0e3f6ce55797..0bb3aaff8441 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -93,6 +93,9 @@ class ProcessBatchOutput: local_transformer_loss: Loss from local transformer (None if not using local transformer) local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) + phoneme_logits: Predicted phoneme logits, shape (B, T', phoneme_stacking_factor * phoneme_vocab_size). None if no phoneme tokenizer. 
+ phoneme_tokens_target: Target phoneme tokens (shifted), shape (B, S, T'). None if no phoneme tokenizer. + phoneme_tokens_lens_target: Length of target phoneme tokens (B,). None if no phoneme tokenizer. audio_codes_target: Target audio codes for the decoder, shape (B, C, T') audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') @@ -106,6 +109,9 @@ class ProcessBatchOutput: local_transformer_loss: Optional[torch.Tensor] local_transformer_logits: Optional[torch.Tensor] logits: torch.Tensor + phoneme_logits: Optional[torch.Tensor] + phoneme_tokens_target: Optional[torch.Tensor] + phoneme_tokens_lens_target: Optional[torch.Tensor] audio_codes_target: torch.Tensor audio_codes_lens_target: torch.Tensor context_audio_codes: torch.Tensor @@ -202,6 +208,8 @@ class StreamingState: phoneme_prediction_end_idx: torch.Tensor gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings + gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking @dataclass @@ -225,6 +233,9 @@ class InferBatchOutput: predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) predicted_codes_lens: torch.Tensor # (B,) rtf_metrics: Dict[str, Any] + predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) + predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item + phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens def worker_init_fn(worker_id): @@ -970,10 +981,14 @@ def sample_codes_from_logits( codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - 
codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) return all_preds @@ -992,10 +1007,14 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) return all_preds @@ -1810,6 +1829,9 @@ def process_batch( # Compute phoneme loss if applicable phoneme_loss = None + pb_phoneme_logits = None + pb_phoneme_tokens_target = None + pb_phoneme_tokens_lens_target = None if self.phoneme_tokenizer is not None and phoneme_tokens_stacked is not None: # Phoneme predictions start at phoneme_delay pred_embeddings_phoneme = self.slice_pred_embeddings( @@ -1817,11 +1839,13 @@ def process_batch( context_lens=phoneme_delay, target_lens=phoneme_tokens_lens_stacked - 1, ) - 
phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) + pb_phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) + pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() + pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens_stacked[:, :, 1:].long(), phoneme_tokens_lens_stacked - 1 + pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: @@ -1837,6 +1861,9 @@ def process_batch( local_transformer_loss=local_transformer_loss, local_transformer_logits=local_transformer_logits, logits=logits, + phoneme_logits=pb_phoneme_logits, + phoneme_tokens_target=pb_phoneme_tokens_target, + phoneme_tokens_lens_target=pb_phoneme_tokens_lens_target, audio_codes_target=audio_codes_target, audio_codes_lens_target=audio_codes_lens_target, context_audio_codes=context_audio_codes_processed, @@ -2451,7 +2478,10 @@ def _sample_audio_codes( # Parallel sampling from all codebook logits audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # Argmax sampling for reliable EOS detection - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + if temperature <= 0.0: + all_codes_next_argmax = audio_codes_next # already argmax + else: + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) return audio_codes_next, all_codes_next_argmax @@ -2471,6 +2501,8 @@ def streaming_init( phoneme_sampling_method: str = 'argmax', gt_phoneme_tokens: Optional[torch.Tensor] = None, gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + gt_audio_codes: Optional[torch.Tensor] = None, + gt_audio_codes_lens: Optional[torch.Tensor] = None, ) -> StreamingState: """ Initialize streaming TTS inference state. 
@@ -2509,6 +2541,9 @@ def streaming_init( phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). + gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, + input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. + gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. Returns: StreamingState: Initial state for streaming inference. @@ -2586,6 +2621,13 @@ def streaming_init( ) gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + # Process GT audio codes if provided (for teacher forcing) + gt_audio_embeddings = None + gt_audio_lens_state = None + if gt_audio_codes is not None and gt_audio_codes_lens is not None: + gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) + gt_audio_lens_state = gt_audio_codes_lens + # Initialize streaming state with batch support state = StreamingState( batch_size=batch_size, @@ -2624,6 +2666,8 @@ def streaming_init( phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), gt_phoneme_embeddings=gt_phoneme_embeddings, gt_phoneme_lens=gt_phoneme_lens, + gt_audio_embeddings=gt_audio_embeddings, + gt_audio_lens=gt_audio_lens_state, ) return state @@ -2722,11 +2766,13 @@ def streaming_step( if force_dropout_text: text_embedded = text_embedded * 0 - text_add_mask = needs_text.view(batch_size, 1, 1).float() - next_input = next_input + text_embedded * text_add_mask # Check for EOS tokens - mark those items as text_finished - # Items that receive EOS should not have their text embedded added after this step + # The EOS token itself IS embedded normally (matching process_batch behavior + # where EOS is part of the text sequence). After this step, text_finished is set + # so subsequent steps won't add any text embedding. 
is_eos_token = text_tokens == self.eos_id # (B,) bool + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token elif text_tokens is None: @@ -2778,28 +2824,39 @@ def streaming_step( # --- Audio embedding for audio phase items --- if needs_audio.any(): - # Determine which items are at first audio step - first_audio_step = needs_audio & (state.audio_steps == 0) - has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) - audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - if first_audio_step.any(): - # Create BOS for items at first audio step - audio_bos = torch.full( - (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) - first_mask = first_audio_step.view(batch_size, 1, 1).float() - audio_emb = audio_emb + audio_bos_emb * first_mask - - if has_last_audio.any() and state.last_audio_codes is not None: - # Use last predicted audio - last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) - last_mask = has_last_audio.view(batch_size, 1, 1).float() - audio_emb = audio_emb + last_audio_emb * last_mask + if state.gt_audio_embeddings is not None: + # Teacher forcing: use pre-computed GT audio embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) + positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) + gt_emb = state.gt_audio_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze(1) # (B, 1, E) + audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() + audio_emb = audio_emb + gt_emb * audio_mask + else: + # Prediction mode: use BOS or last predicted audio + first_audio_step = 
needs_audio & (state.audio_steps == 0) + has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask next_input = next_input + audio_emb @@ -2952,6 +3009,14 @@ def streaming_step( state.all_predictions.append(audio_codes_unstacked) audio_codes_next = audio_codes_unstacked + # Force-finish items when GT audio is exhausted (teacher forcing). + # This is checked AFTER predictions so the last valid prediction is still made. + # audio_steps was already incremented above. When audio_steps >= gt_audio_lens, + # we've consumed all GT input positions and made all corresponding predictions. + if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None: + gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens) + state.finished = state.finished | gt_exhausted + return state, audio_codes_next, pred_phoneme_tokens def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: @@ -3175,6 +3240,7 @@ def infer_batch( phoneme_input_type: str = 'pred', phoneme_sampling_method: str = 'argmax', force_dropout_text: bool = False, + use_teacher_forced: bool = False, ) -> InferBatchOutput: """ Batch inference using streaming infrastructure. 
@@ -3192,8 +3258,11 @@ def infer_batch( - context_audio / context_audio_lens: Raw context audio to encode - phoneme_tokens (optional): GT phoneme tokens (B, L'') - phoneme_tokens_lens (optional): Lengths (B,) + For teacher forcing (use_teacher_forced=True), also requires: + - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR + - audio / audio_lens: Raw audio waveforms to encode max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature for audio codes. + temperature: Sampling temperature for audio codes. Use 0.0 for argmax. topk: Top-k sampling parameter. use_cfg: Whether to use classifier-free guidance. cfg_scale: CFG scale factor. @@ -3201,6 +3270,8 @@ def infer_batch( phoneme_input_type: 'gt' or 'pred' for phoneme tokens. phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. force_dropout_text: Whether to dropout text embeddings. + use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) + instead of predicted codes at each streaming step. Returns: InferBatchOutput containing predicted audio, codes, and RTF metrics. @@ -3227,6 +3298,53 @@ def infer_batch( gt_phoneme_tokens = batch.get('phoneme_tokens') gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + # Prepare GT audio codes for teacher forcing if requested + gt_audio_codes_for_init = None + gt_audio_codes_lens_for_init = None + if use_teacher_forced: + # Force GT phoneme input and argmax sampling + phoneme_input_type = 'gt' + temperature = 0.0 + + # Get GT audio codes - support both codes and raw audio + if 'audio_codes' in batch: + gt_audio_codes_raw = batch['audio_codes'] + gt_audio_codes_lens_raw = batch['audio_codes_lens'] + elif 'audio' in batch: + gt_audio_codes_raw, gt_audio_codes_lens_raw = self.audio_to_codes( + batch['audio'], batch['audio_lens'] + ) + else: + raise ValueError( + "Teacher forcing requires 'audio_codes'/'audio_codes_lens' or 'audio'/'audio_lens' in batch." 
+ ) + + # Pre-process GT audio codes same as prepare_audio_channel_embeddings: + # codec convert, add BOS/EOS, stack, then take input portion ([:, :, :-1]) + if self._codec_converter is not None: + gt_audio_codes_raw = self._codec_converter.convert_original_to_new( + audio_tokens=gt_audio_codes_raw, audio_lens=gt_audio_codes_lens_raw + ).long() + + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + codes=gt_audio_codes_raw, + codes_len=gt_audio_codes_lens_raw, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( + gt_audio_codes_processed, + gt_audio_codes_lens_processed, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + + # Input portion: all tokens except the last (teacher forcing shift) + gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] + gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 + batch_size = text.size(0) # Initialize streaming state @@ -3244,6 +3362,8 @@ def infer_batch( phoneme_sampling_method=phoneme_sampling_method, gt_phoneme_tokens=gt_phoneme_tokens, gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + gt_audio_codes=gt_audio_codes_for_init, + gt_audio_codes_lens=gt_audio_codes_lens_for_init, ) time_to_first_prediction = None @@ -3296,12 +3416,30 @@ def infer_batch( 'batch_size': batch_size, } + # Extract raw phoneme predictions from state + ib_phoneme_tokens = None + ib_phoneme_tokens_lens = None + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + # Stack: each element is (B, phoneme_stacking_factor), stack along time -> (B, S, T) + ib_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + # Compute per-item lengths using start/end indices + ib_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + for i in range(batch_size): + start = max(0, 
state.phoneme_prediction_start_idx[i].item()) + end = state.phoneme_prediction_end_idx[i].item() + if end < 0: + end = ib_phoneme_tokens.size(-1) + ib_phoneme_tokens_lens[i] = end - start + return InferBatchOutput( predicted_audio=finalize_output.audio, predicted_audio_lens=finalize_output.audio_len, predicted_codes=finalize_output.audio_codes, predicted_codes_lens=finalize_output.audio_codes_len, rtf_metrics=rtf_metrics, + predicted_phoneme_tokens=ib_phoneme_tokens, + predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, + phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, ) @classmethod diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py new file mode 100644 index 000000000000..b9838602586a --- /dev/null +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -0,0 +1,487 @@ +""" +Test script to verify that infer_batch (teacher-forced) produces the same audio code +and phoneme predictions as process_batch (single forward pass). + +Usage: + python tests/collections/tts/test_infer_vs_process_batch.py --codecmodel_path /path/to/codec.nemo + +The script: +1. Builds a tiny NemotronH-backed EasyMagpieTTSModel with a real codec model. +2. Creates synthetic random inputs (with variable lengths per batch item). +3. Runs process_batch (full-sequence forward) and infer_batch (streaming, teacher-forced). +4. Compares the argmax audio code predictions and phoneme predictions from both paths. +5. Repeats for multiple configurations. 
+""" + +import argparse +import sys +import torch +from omegaconf import OmegaConf + +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel + + +def build_minimal_config(codecmodel_path: str) -> OmegaConf: + """Build a minimal OmegaConf config for a tiny NemotronH model.""" + hidden_size = 256 + + cfg_dict = { + # Decoder backend + 'decoder_type': 'nemotron_h', + 'nemotron_h_config': { + 'hidden_size': hidden_size, + 'num_hidden_layers': 2, + 'vocab_size': 131072, + 'num_attention_heads': 4, + 'num_key_value_heads': 2, + 'attention_dropout': 0.0, + 'attention_bias': False, + 'max_position_embeddings': 4096, + 'mamba_num_heads': 16, + 'mamba_head_dim': 16, + 'ssm_state_size': 128, + 'conv_kernel': 4, + 'n_groups': 8, + 'chunk_size': 256, + 'mamba_hidden_act': 'silu', + 'use_conv_bias': True, + 'use_bias': False, + 'intermediate_size': 512, + 'mlp_hidden_act': 'silu', + 'mlp_bias': False, + 'hybrid_override_pattern': 'M*', + 'layer_norm_epsilon': 1e-5, + 'residual_in_fp32': True, + }, + 'embedding_dim': hidden_size, + 'hidden_dim': hidden_size, + 'audio_embedding_dim': hidden_size, + 'codecmodel_path': codecmodel_path, + # Text tokenizer - use a simple AutoTokenizer + 'text_tokenizers': { + 'test_tokenizer': { + '_target_': 'AutoTokenizer', + 'pretrained_model': 'gpt2', + }, + }, + # Phoneme tokenizer + 'phoneme_tokenizer': { + '_target_': 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer', + 'tokenizer_path': 'scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json', + }, + 'phoneme_stacking_factor': 1, + # Training modes (single streaming mode) + 'training_modes': [ + { + 'name': 'streaming_4_8', + 'text_input_mode': 'streaming', + 'streaming_phonemes_delay': 4, + 'streaming_speech_delay': 8, + }, + ], + 'frame_stacking_factor': 2, + 'cfg_unconditional_prob': 0.0, + 'dropout_text_input_prob': 0.0, + 'dropout_phoneme_input_prob': 0.0, + 'local_transformer_type': 'none', + 'run_val_inference': 
False, + # Optim placeholder (required by ModelPT but not used) + 'optim': { + '_target_': 'torch.optim.AdamW', + 'lr': 1e-4, + }, + # No dataloaders + } + return OmegaConf.create(cfg_dict) + + +def create_synthetic_batch( + model, + batch_size=2, + text_lens_list=None, + audio_frames_list=None, + context_text_lens_list=None, + context_audio_frames_list=None, + phoneme_lens_list=None, + device='cpu', +): + """Create a synthetic batch with random valid token IDs and variable lengths per item. + + If *_list args are None, defaults to uniform lengths for all items. + """ + num_codebooks = model.num_audio_codebooks + codebook_size = model.codebook_size + text_vocab_size = model.bos_id # valid text tokens are [0, bos_id) + phoneme_vocab_size = model.phoneme_tokenizer.vocab_size - 2 # exclude BOS/EOS + + # Defaults + if text_lens_list is None: + text_lens_list = [20] * batch_size + if audio_frames_list is None: + audio_frames_list = [30] * batch_size + if context_text_lens_list is None: + context_text_lens_list = [10] * batch_size + if context_audio_frames_list is None: + context_audio_frames_list = [15] * batch_size + if phoneme_lens_list is None: + phoneme_lens_list = [25] * batch_size + + assert len(text_lens_list) == batch_size + assert len(audio_frames_list) == batch_size + assert len(context_text_lens_list) == batch_size + assert len(context_audio_frames_list) == batch_size + assert len(phoneme_lens_list) == batch_size + + # Max lengths for padding + max_text_len = max(text_lens_list) + max_audio_frames = max(audio_frames_list) + max_context_text_len = max(context_text_lens_list) + max_context_audio_frames = max(context_audio_frames_list) + max_phoneme_len = max(phoneme_lens_list) + + # Text tokens: random tokens + EOS at the end (matching dataset behavior) + text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) + for b in range(batch_size): + tl = text_lens_list[b] + text[b, :tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), 
device=device) + text[b, tl - 1] = model.eos_id # EOS as last valid token + text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) + + # Context text tokens + context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) + for b in range(batch_size): + cl = context_text_lens_list[b] + context_text_tokens[b, :cl] = torch.randint(0, text_vocab_size, (cl,), device=device) + context_text_tokens_lens = torch.tensor(context_text_lens_list, dtype=torch.long, device=device) + + # Audio codes (raw, without BOS/EOS) + audio_codes = torch.zeros(batch_size, num_codebooks, max_audio_frames, dtype=torch.long, device=device) + for b in range(batch_size): + af = audio_frames_list[b] + audio_codes[b, :, :af] = torch.randint(0, codebook_size, (num_codebooks, af), device=device) + audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) + + # Context audio codes (raw, without BOS/EOS) + context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device) + for b in range(batch_size): + caf = context_audio_frames_list[b] + context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) + context_audio_codes_lens = torch.tensor(context_audio_frames_list, dtype=torch.long, device=device) + + # Phoneme tokens (raw IDs, BOS/EOS will be added by the model) + phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) + for b in range(batch_size): + pl = phoneme_lens_list[b] + phoneme_tokens[b, :pl] = torch.randint(0, phoneme_vocab_size, (pl,), device=device) + phoneme_tokens_lens = torch.tensor(phoneme_lens_list, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tokens, + 'context_text_tokens_lens': context_text_tokens_lens, + 'audio_codes': audio_codes, + 'audio_codes_lens': audio_codes_lens, + 
'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + 'phoneme_tokens': phoneme_tokens, + 'phoneme_tokens_lens': phoneme_tokens_lens, + } + return batch + + +def compare_audio_codes(model, pb_output, ib_output, batch): + """Compare audio codes from process_batch and infer_batch. Returns True if all match.""" + C = model.num_audio_codebooks + S = model.frame_stacking_factor + C_stacked = C * S + V = model.num_all_tokens_per_codebook + pb_logits = pb_output.logits # (B, T_stacked, C_stacked * V) + T_stacked = pb_logits.size(1) + batch_size = batch['text'].size(0) + + # Extract per-codebook argmax at stacked resolution + pb_stacked_codes_list = [] + for cb_idx in range(C_stacked): + si = cb_idx * V + ei = si + V + cb_logits = pb_logits[:, :, si:ei] # (B, T_stacked, V) + cb_preds = cb_logits.argmax(dim=-1) # (B, T_stacked) + pb_stacked_codes_list.append(cb_preds) + pb_stacked_codes = torch.stack(pb_stacked_codes_list, dim=1) # (B, C_stacked, T_stacked) + + # Unstack: (B, C*S, T_stacked) -> (B, C, S, T_stacked) -> (B, C, T_stacked, S) -> (B, C, T_stacked*S) + pb_unstacked = pb_stacked_codes.view(batch_size, C, S, T_stacked) + pb_unstacked = pb_unstacked.permute(0, 1, 3, 2).contiguous() + pb_unstacked = pb_unstacked.reshape(batch_size, C, T_stacked * S) + pb_unstacked_lens = pb_output.audio_codes_lens_target * S + + ib_codes = ib_output.predicted_codes + ib_codes_lens = ib_output.predicted_codes_lens + + print(f" process_batch argmax codes (unstacked): {pb_unstacked.shape}, lens: {pb_unstacked_lens.tolist()}") + print(f" infer_batch predicted codes: {ib_codes.shape}, lens: {ib_codes_lens.tolist()}") + + all_match = True + for b in range(batch_size): + pb_len = pb_unstacked_lens[b].item() + ib_len = ib_codes_lens[b].item() + compare_len = min(pb_len, ib_len) + + if compare_len == 0: + print(f" Batch item {b}: No codes to compare (pb_len={pb_len}, ib_len={ib_len})") + continue + + pb_codes_b = pb_unstacked[b, :, :compare_len] 
+ ib_codes_b = ib_codes[b, :, :compare_len] + + matches = (pb_codes_b == ib_codes_b).all() + num_matching = (pb_codes_b == ib_codes_b).sum().item() + total = pb_codes_b.numel() + match_pct = 100.0 * num_matching / total if total > 0 else 0.0 + + print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") + print(f" Audio match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") + + if not matches: + all_match = False + mismatch_mask = pb_codes_b != ib_codes_b + mismatch_positions = mismatch_mask.nonzero(as_tuple=False) + num_show = min(10, mismatch_positions.size(0)) + for i in range(num_show): + cb, t = mismatch_positions[i].tolist() + print(f" Mismatch at codebook={cb}, time={t}: " + f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}") + + return all_match + + +def compare_phoneme_predictions(model, pb_output, ib_output, batch): + """Compare phoneme predictions from process_batch and infer_batch. Returns True if all match.""" + if pb_output.phoneme_logits is None: + print(" No phoneme logits from process_batch (no phoneme tokenizer?). Skipping.") + return True + if ib_output.predicted_phoneme_tokens is None: + print(" No phoneme predictions from infer_batch. 
Skipping.") + return True + + batch_size = batch['text'].size(0) + phoneme_stacking_factor = model.phoneme_stacking_factor + phoneme_vocab_size = model.phoneme_vocab_size + + # Extract argmax phoneme predictions from process_batch logits + # phoneme_logits: (B, T_phoneme, phoneme_stacking_factor * phoneme_vocab_size) + pb_phoneme_logits = pb_output.phoneme_logits + T_phoneme = pb_phoneme_logits.size(1) + + pb_phoneme_preds_list = [] + for sf_idx in range(phoneme_stacking_factor): + si = sf_idx * phoneme_vocab_size + ei = si + phoneme_vocab_size + sf_logits = pb_phoneme_logits[:, :, si:ei] # (B, T_phoneme, V_phoneme) + sf_preds = sf_logits.argmax(dim=-1) # (B, T_phoneme) + pb_phoneme_preds_list.append(sf_preds) + pb_phoneme_preds = torch.stack(pb_phoneme_preds_list, dim=1) # (B, phoneme_stacking_factor, T_phoneme) + pb_phoneme_lens = pb_output.phoneme_tokens_lens_target # (B,) number of phoneme prediction steps + + # infer_batch phoneme predictions: (B, phoneme_stacking_factor, T_all_steps) + ib_phoneme_preds = ib_output.predicted_phoneme_tokens + ib_phoneme_lens = ib_output.predicted_phoneme_tokens_lens + + print(f" process_batch phoneme preds: {pb_phoneme_preds.shape}, lens: {pb_phoneme_lens.tolist()}") + print(f" infer_batch phoneme preds: {ib_phoneme_preds.shape}, lens: {ib_phoneme_lens.tolist()}") + + # Get start indices for infer_batch phoneme predictions + ib_start_idx = ib_output.phoneme_prediction_start_idx # (B,) + + all_match = True + for b in range(batch_size): + pb_len = pb_phoneme_lens[b].item() + ib_len = ib_phoneme_lens[b].item() + compare_len = min(pb_len, ib_len) + + if compare_len == 0: + print(f" Batch item {b}: No phonemes to compare (pb_len={pb_len}, ib_len={ib_len})") + continue + + # process_batch phoneme preds start from 0 (already sliced to prediction region) + pb_ph_b = pb_phoneme_preds[b, :, :compare_len] + + # infer_batch phoneme preds: slice from start_idx for this batch item + start = max(0, ib_start_idx[b].item()) + ib_ph_b = 
ib_phoneme_preds[b, :, start:start + compare_len] + + matches = (pb_ph_b == ib_ph_b).all() + num_matching = (pb_ph_b == ib_ph_b).sum().item() + total = pb_ph_b.numel() + match_pct = 100.0 * num_matching / total if total > 0 else 0.0 + + print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") + print(f" Phoneme match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") + + if not matches: + all_match = False + mismatch_mask = pb_ph_b != ib_ph_b + mismatch_positions = mismatch_mask.nonzero(as_tuple=False) + num_show = min(10, mismatch_positions.size(0)) + for i in range(num_show): + sf, t = mismatch_positions[i].tolist() + print(f" Mismatch at stacking_factor={sf}, time={t}: " + f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}") + + return all_match + + +def run_single_test(model, batch, test_name, device): + """Run a single test comparing process_batch and infer_batch outputs.""" + print(f"\n{'='*60}") + print(f"TEST: {test_name}") + print(f"{'='*60}") + + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + print(f" {k}: shape={v.shape}, dtype={v.dtype}") + + # Run process_batch + print("\n Running process_batch...") + training_mode = model.training_modes[0] + with torch.inference_mode(): + pb_output = model.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + audio_codes=batch['audio_codes'], + audio_codes_lens=batch['audio_codes_lens'], + context_audio_codes=batch['context_audio_codes'], + context_audio_codes_lens=batch['context_audio_codes_lens'], + phoneme_tokens=batch['phoneme_tokens'], + phoneme_tokens_lens=batch['phoneme_tokens_lens'], + mode='val', + training_mode=training_mode, + ) + + # Run infer_batch (teacher-forced) + print(" Running infer_batch (teacher-forced)...") + ib_output = model.infer_batch( + batch=batch, + max_decoder_steps=1000, + temperature=0.0, + 
topk=80, + use_cfg=False, + use_local_transformer_for_inference=False, + phoneme_input_type='gt', + phoneme_sampling_method='argmax', + use_teacher_forced=True, + ) + + # Compare audio codes + print("\n --- Audio Codes Comparison ---") + audio_match = compare_audio_codes(model, pb_output, ib_output, batch) + + # Compare phoneme predictions + print("\n --- Phoneme Predictions Comparison ---") + phoneme_match = compare_phoneme_predictions(model, pb_output, ib_output, batch) + + success = audio_match and phoneme_match + if success: + print(f"\n ✓ {test_name}: PASSED (audio + phoneme match)") + else: + parts = [] + if not audio_match: + parts.append("audio") + if not phoneme_match: + parts.append("phoneme") + print(f"\n ✗ {test_name}: FAILED ({' and '.join(parts)} mismatch)") + + return success + + +def main(): + parser = argparse.ArgumentParser(description='Test infer_batch vs process_batch') + parser.add_argument('--codecmodel_path', type=str, required=True, help='Path to codec model .nemo file') + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + + device = args.device + print(f"Using device: {device}") + + # 1. 
Build config and model + print("Building minimal config...") + cfg = build_minimal_config(args.codecmodel_path) + + print("Instantiating EasyMagpieTTSModel (tiny NemotronH + real codec)...") + model = EasyMagpieTTSModel(cfg=cfg, trainer=None) + model = model.to(device) + model.eval() + print(f" num_audio_codebooks={model.num_audio_codebooks}, codebook_size={model.codebook_size}") + print(f" frame_stacking_factor={model.frame_stacking_factor}") + print(f" phoneme_vocab_size={model.phoneme_tokenizer.vocab_size}") + + # Define test configurations: (test_name, kwargs_for_create_synthetic_batch) + test_configs = [ + ( + "Uniform lengths (B=2, text=20, audio=30, ctx_text=10, ctx_audio=15, phoneme=25)", + dict( + batch_size=2, + text_lens_list=[20, 20], + audio_frames_list=[30, 30], + context_text_lens_list=[10, 10], + context_audio_frames_list=[15, 15], + phoneme_lens_list=[25, 25], + ), + ), + ( + "Variable text & context lens (B=2, text=[15,25], ctx_text=[8,12], ctx_audio=[10,20])", + dict( + batch_size=2, + text_lens_list=[15, 25], + audio_frames_list=[30, 30], + context_text_lens_list=[8, 12], + context_audio_frames_list=[10, 20], + phoneme_lens_list=[20, 30], + ), + ), + ( + "Variable audio & phoneme lens (B=2, audio=[20,40], phoneme=[15,35])", + dict( + batch_size=2, + text_lens_list=[20, 20], + audio_frames_list=[20, 40], + context_text_lens_list=[10, 10], + context_audio_frames_list=[15, 15], + phoneme_lens_list=[15, 35], + ), + ), + ( + "All different (B=3)", + dict( + batch_size=3, + text_lens_list=[12, 20, 28], + audio_frames_list=[20, 30, 40], + context_text_lens_list=[6, 10, 14], + context_audio_frames_list=[8, 15, 22], + phoneme_lens_list=[15, 25, 35], + ), + ), + ] + + all_passed = True + for test_name, kwargs in test_configs: + batch = create_synthetic_batch(model, device=device, **kwargs) + passed = run_single_test(model, batch, test_name, device) + if not passed: + all_passed = False + + # Final summary + print(f"\n{'='*60}") + if all_passed: + print("✓ 
ALL TESTS PASSED") + else: + print("✗ SOME TESTS FAILED") + sys.exit(1) + print(f"{'='*60}") + + +if __name__ == '__main__': + main() From 4a0e36b8b80a670be464901fe766b90ad8f9c800 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 14:09:47 -0800 Subject: [PATCH 42/94] tested and verified that infer batch works correctly with teacher forcing and matches process batch output Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 58 ++++++++++--------- .../tts/test_infer_vs_process_batch.py | 2 +- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 0bb3aaff8441..580039e3db1b 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2977,33 +2977,35 @@ def streaming_step( state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) # Check for EOS in each frame and track exact end position - # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) - all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) - - # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) - eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) - eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) - - # Find first frame with EOS per batch item (or S if none) - eos_frame_idx = torch.where( - eos_any_codebook.any(dim=1), - eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device), # no EOS in this step - ) # (B,) - - audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) - state.finished = state.finished | audio_eos_detected - - # Track audio prediction end index (in frames) for items that just ended - newly_ended_audio = 
audio_eos_detected & (state.audio_prediction_end_idx == -1) - if newly_ended_audio.any(): - # End index = current frame count + frame offset where EOS was found - current_frame_count = len(state.all_predictions) * self.frame_stacking_factor - end_frame_idx = current_frame_count + eos_frame_idx - state.audio_prediction_end_idx = torch.where( - newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx - ) + # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead + if state.gt_audio_embeddings is None: + # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) + all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) + + # For each batch item, find if/where EOS occurs in this step's frames + eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) + eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) + eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) + + # Find first frame with EOS per batch item (or S if none) + eos_frame_idx = torch.where( + eos_any_codebook.any(dim=1), + eos_any_codebook.int().argmax(dim=1), # first frame with EOS + torch.full((batch_size,), S, device=device), # no EOS in this step + ) # (B,) + + audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + state.finished = state.finished | audio_eos_detected + + # Track audio prediction end index (in frames) for items that just ended + newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) + if newly_ended_audio.any(): + # End index = current frame count + frame offset where EOS was found + current_frame_count = len(state.all_predictions) * self.frame_stacking_factor + end_frame_idx = current_frame_count + eos_frame_idx + state.audio_prediction_end_idx = torch.where( + newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx + ) # Store unstacked codes state.all_predictions.append(audio_codes_unstacked) @@ -3030,7 +3032,7 @@ def 
_predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # Sample phonemes if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0) else: pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index b9838602586a..006be87ebaa2 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -49,7 +49,7 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: 'intermediate_size': 512, 'mlp_hidden_act': 'silu', 'mlp_bias': False, - 'hybrid_override_pattern': 'M*', + 'hybrid_override_pattern': 'M*', # All Mamba layers 'layer_norm_epsilon': 1e-5, 'residual_in_fp32': True, }, From 0518c99606751ef74370a6598d87eaab57e7b6af Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 15:07:14 -0800 Subject: [PATCH 43/94] added legacy option to still work with 21fps F2F model Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_inference.py | 4 ++++ nemo/collections/tts/models/easy_magpietts.py | 21 +++++++++++++------ .../modules/magpietts_inference/inference.py | 8 +++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 1e7753798db4..feead6519875 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -507,6 +507,7 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) 
target_group.add_argument('--is_decoder_only_model', action='store_true') + target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] @@ -575,6 +576,9 @@ def main(argv=None): phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, + legacy_context_stacking=args.legacy_context_stacking, + longform_mode=args.longform_mode, + longform_word_threshold=args.longform_word_threshold, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 580039e3db1b..8d7a956ac5d4 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -935,10 +935,15 @@ def local_transformer_sample_autoregressive( ) # (B, num_tokens_per_codebook) codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) if use_cfg: codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] all_preds.append(codebook_preds) @@ -1226,11 +1231,15 @@ def 
prepare_context_tensors( eos_id=self.context_audio_eos_id, ) + # Use legacy audio_bos_id/audio_eos_id if flag is set + stack_bos_id = self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id + stack_eos_id = self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - self.context_audio_bos_id, - self.context_audio_eos_id, + stack_bos_id, + stack_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 9b0db0f7f75e..70ca811f58a2 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -77,6 +77,11 @@ class InferenceConfig: phoneme_input_type: str = "gt" # gt or predicted phoneme_sampling_method: str = "argmax" # argmax or multinomial dropout_text_input: bool = False + legacy_context_stacking: bool = False # Use audio_bos_id/audio_eos_id for context stacking + + # Longform inference mode + longform_mode: str = "auto" # "auto" | "always" | "never" + longform_word_threshold: int = 40 # Word threshold for auto-detection is_decoder_only_model: bool = False def build_identifier(self) -> str: @@ -146,6 +151,9 @@ def __init__( self.model = model self.config = config + # Set legacy context stacking flag on model + self.model.legacy_context_stacking = config.legacy_context_stacking + # Set phoneme probability to 1 for inference self._configure_tokenizer() From fa0fafb7faf3059d7c8f12dda21fa798153e17d4 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 15:31:07 -0800 Subject: [PATCH 44/94] remove streaming decode because it not being used Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 57 
+------------------ 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 8d7a956ac5d4..d48bf3125d10 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2809,7 +2809,7 @@ def streaming_step( # Prediction mode: use BOS or last predicted phoneme first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) has_last_phoneme = ( - needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) ) if first_phoneme_step.any(): @@ -3077,61 +3077,6 @@ def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, tor return audio_codes_next, all_codes_next_argmax - def streaming_decode( - self, - state: StreamingState, - previous_decode_length: int = 0, - ) -> Tuple[torch.Tensor, torch.Tensor, int]: - """ - Decode accumulated audio codes to waveform, returning only the new chunk. - - WARNING: This function does not yet support batch_size > 1. - Do not use with batched streaming inference. Use streaming_finalize instead. - - This function takes all predicted codes so far and decodes them, but only - returns the newly generated audio portion (after previous_decode_length). - - Args: - state: Current StreamingState containing all_predictions. - previous_decode_length: Number of audio samples already decoded and returned - in previous calls. Use 0 on first call. 
- - Returns: - Tuple of: - - new_audio: Newly generated audio waveform (1, new_samples) - - new_audio_len: Length of new audio (1,) - - total_decode_length: Total decoded length so far (use as previous_decode_length - for next call) - """ - if len(state.all_predictions) == 0: - return ( - torch.zeros(1, 0, device=state.device), - torch.zeros(1, dtype=torch.long, device=state.device), - previous_decode_length, - ) - - with torch.inference_mode(): - # Concatenate all predictions - each is (1, C, S), concat gives (1, C, T_total_frames) - predicted_codes = torch.cat(state.all_predictions, dim=-1) # (1, C, T_total_frames) - predicted_codes_lens = torch.tensor([predicted_codes.size(-1)], device=state.device) - - # Decode to audio (codes are already unstacked, no EOS removal needed) - audio, audio_len, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) - - # Extract only new audio - total_decode_length = audio_len[0].item() - if total_decode_length <= previous_decode_length: - return ( - torch.zeros(1, 0, device=state.device), - torch.zeros(1, dtype=torch.long, device=state.device), - previous_decode_length, - ) - - new_audio = audio[:, previous_decode_length:total_decode_length] - new_audio_len = torch.tensor([total_decode_length - previous_decode_length], device=state.device) - - return new_audio, new_audio_len, total_decode_length - def streaming_finalize( self, state: StreamingState, From 432605bc6a227449bec2b1fef370bfa3d6ee0f86 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 8 Feb 2026 09:19:09 -0800 Subject: [PATCH 45/94] pass phoneme EOS to next step Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d48bf3125d10..8708c6f6732e 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -151,6 +151,7 @@ 
class StreamingState: phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). audio_steps: Number of audio prediction steps taken per batch item (B,). phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. finished: Whether generation is complete per batch item (B,) bool tensor. device: Device tensors are on. training_mode: The training mode being used for inference. @@ -187,6 +188,7 @@ class StreamingState: phoneme_steps: torch.Tensor audio_steps: torch.Tensor phoneme_stream_ended: torch.Tensor + phoneme_eos_detected: torch.Tensor finished: torch.Tensor device: torch.device training_mode: TrainingMode @@ -2654,6 +2656,7 @@ def streaming_init( phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), finished=torch.zeros(batch_size, dtype=torch.bool, device=device), device=device, training_mode=selected_training_mode, @@ -2828,8 +2831,12 @@ def streaming_step( ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + # Only end phoneme stream in prediction mode when the phoneme EOS is detected + state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected next_input = next_input + phoneme_emb + # --- Audio embedding for audio phase items --- if needs_audio.any(): @@ -2947,6 +2954,7 @@ def streaming_step( dim=1 ) # (B,) state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended newly_ended_phoneme = 
phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) From b239c2f8ad15576e1cd2c666abdd3452c05c5a23 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 8 Feb 2026 18:50:08 -0800 Subject: [PATCH 46/94] exlcude codec model from optimizer params Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 8708c6f6732e..9f4e74a7141a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -557,7 +557,7 @@ def setup_optimizer_param_groups(self): """ modules_to_exclude = { '_speaker_verification_model', - # '_codec_model', + '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', 'whisper_model', From ff68871058dfdf93a591f50d1d373105aad5f588 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 00:59:20 -0800 Subject: [PATCH 47/94] reduce dropout prob, change default delays to 0,1 Signed-off-by: Shehzeen Hussain --- examples/tts/conf/magpietts/easy_magpietts.yaml | 12 ++++++------ .../tts/conf/magpietts/easy_magpietts_lhotse.yaml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 8c44fef3f173..11ab71ab3a9b 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -73,7 +73,7 @@ model: local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 - cfg_unconditional_prob: 0.1 + cfg_unconditional_prob: 0.05 # To get special_tokens of the tokenzer, you can do: # model.tokenizer.first_tokenizer.additional_special_tokens @@ -82,15 +82,15 @@ model: # Each mode has its own task embedding that is prepended to the context. 
# During inference, you can specify which mode to use via the 'inference_mode' parameter. training_modes: - - name: "streaming_4_8" + - name: "streaming_0_1" text_input_mode: "streaming" # Options: "full", "streaming" - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 + streaming_phonemes_delay: 0 + streaming_speech_delay: 1 frame_stacking_factor: 2 phoneme_stacking_factor: 1 - dropout_text_input_prob: 0.3 - dropout_phoneme_input_prob: 0.3 + dropout_text_input_prob: 0.1 + dropout_phoneme_input_prob: 0.1 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index af943ee25dbb..dd6cf50d7c25 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -70,22 +70,22 @@ model: local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 - cfg_unconditional_prob: 0.1 + cfg_unconditional_prob: 0.05 # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "streaming_4_8" + - name: "streaming_0_1" text_input_mode: "streaming" # Options: "full", "streaming" - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 + streaming_phonemes_delay: 0 + streaming_speech_delay: 1 frame_stacking_factor: 2 phoneme_stacking_factor: 1 - dropout_text_input_prob: 0.3 - dropout_phoneme_input_prob: 0.3 + dropout_text_input_prob: 0.1 + dropout_phoneme_input_prob: 0.1 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer From 021bd9e7d9e01ff855eb3a0a5e5b1f4046b1db01 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 09:12:09 -0800 Subject: [PATCH 48/94] bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 9f4e74a7141a..a42b7e6da5f1 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3011,7 +3011,7 @@ def streaming_step( torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) - audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio state.finished = state.finished | audio_eos_detected # Track audio prediction end index (in frames) for items that just ended From cb2cff1f26138073537205743dc5bac15fd5301c Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 15:38:18 -0800 Subject: [PATCH 49/94] phoneme EOS handling bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index a42b7e6da5f1..36b7f0f6b451 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -2953,7 +2953,7 @@ def streaming_step( ).any( dim=1 ) # (B,) - state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended From 386814d56b20b7f7443301947ee606206acdd37d Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 18:17:56 -0800 Subject: [PATCH 50/94] phoneme corruption methodology implemented Signed-off-by: Shehzeen Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 5 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 5 +- .../tts/data/text_to_speech_dataset_lhotse.py | 3 +- nemo/collections/tts/models/easy_magpietts.py | 132 ++++++++++++++++-- .../tts/test_infer_vs_process_batch.py | 1 - 5 files changed, 128 insertions(+), 18 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 11ab71ab3a9b..9545897ceda3 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -89,8 +89,11 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 + phoneme_confidence_unk_threshold: 0.35 dropout_text_input_prob: 0.1 - dropout_phoneme_input_prob: 0.1 + phoneme_corruption_batch_prob: 0.1 + phoneme_corruption_timestep_ratio: 0.15 + phoneme_corruption_unk_mode_prob: 0.5 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index dd6cf50d7c25..19d39f4cf320 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -84,8 +84,11 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 + phoneme_confidence_unk_threshold: 0.35 dropout_text_input_prob: 0.1 
- dropout_phoneme_input_prob: 0.1 + phoneme_corruption_batch_prob: 0.1 + phoneme_corruption_timestep_ratio: 0.15 + phoneme_corruption_unk_mode_prob: 0.5 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index ba111838efa3..464b988b9415 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -65,7 +65,8 @@ def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_vocab_size = len(phoneme_tokenizer.tokens) phoneme_tokenizer.bos_token_id = phoneme_vocab_size phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 - phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 + phoneme_tokenizer.unk_token_id = phoneme_vocab_size + 2 + phoneme_tokenizer.vocab_size = phoneme_vocab_size + 3 return phoneme_tokenizer diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 36b7f0f6b451..2fc5da261fe6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -373,11 +373,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.cfg_unk_token_id = num_tokens - 1 self.phoneme_tokenizer = None self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0) - self.dropout_phoneme_input_prob = cfg.get('dropout_phoneme_input_prob', 0.0) + self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) + self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) + self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) if cfg.get('phoneme_tokenizer', None) is not None: self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = 
cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + # If max phoneme probability is below this threshold at inference-time, + # replace the predicted timestep with UNK to reduce error propagation. + self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.35) self.pad_context_text_to_max_duration = False @@ -1341,8 +1346,9 @@ def prepare_phoneme_channel_embeddings( phoneme_tokens: torch.Tensor, phoneme_tokens_lens: torch.Tensor, delay: torch.Tensor, - dropout_phoneme_input: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + apply_corruption: bool = False, + dropout_complete_phoneme_channel: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: """ Prepare phoneme embeddings as a channel input with delay handling. @@ -1355,7 +1361,8 @@ def prepare_phoneme_channel_embeddings( phoneme_tokens_lens: Length of phoneme tokens for each batch item (B,) delay: Number of zero positions to prepend for each batch item (B,). This is typically context_lens + phoneme_delay. - dropout_phoneme_input: If True, return all zeros (for phoneme dropout regularization). + apply_corruption: If True, apply phoneme-token corruption before embedding. + dropout_complete_phoneme_channel: If True, zero-out the whole phoneme channel embedding. 
Returns: Tuple of: @@ -1363,6 +1370,7 @@ def prepare_phoneme_channel_embeddings( - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + - corruption_mode: None, "unk", or "repeat_skip" """ batch_size = phoneme_tokens.size(0) device = phoneme_tokens.device @@ -1378,6 +1386,13 @@ def prepare_phoneme_channel_embeddings( 1, ) + phoneme_corruption_mode = None + if apply_corruption: + phoneme_tokens_stacked, phoneme_corruption_mode = self.corrupt_stacked_phoneme_tokens( + phoneme_tokens_stacked=phoneme_tokens_stacked, + phoneme_tokens_lens_stacked=phoneme_tokens_lens_stacked, + ) + # Embed phoneme tokens phoneme_embedded = self.embed_phoneme_tokens(phoneme_tokens_stacked) # (B, T', E) @@ -1386,7 +1401,7 @@ def prepare_phoneme_channel_embeddings( phoneme_embedded = phoneme_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) # Handle phoneme dropout - zero out the embeddings - if dropout_phoneme_input: + if dropout_complete_phoneme_channel: phoneme_embedded = phoneme_embedded * 0.0 # Create zero tensor for delay padding @@ -1399,7 +1414,78 @@ def prepare_phoneme_channel_embeddings( lengths=[delay, phoneme_tokens_lens_stacked], ) - return phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked + return ( + phoneme_channel_embedding, + phoneme_channel_lens, + phoneme_tokens_stacked, + phoneme_tokens_lens_stacked, + phoneme_corruption_mode, + ) + + def corrupt_stacked_phoneme_tokens( + self, + phoneme_tokens_stacked: torch.Tensor, + phoneme_tokens_lens_stacked: torch.Tensor, + ) -> Tuple[torch.Tensor, Optional[str]]: + """ + Corrupt stacked phoneme tokens for robustness to phoneme prediction errors. + + Two corruption modes are supported: + 1. UNK replacement at selected timesteps (all stacked channels replaced). + 2. 
Repeat/skip corruption via a shared index remapping over the valid prefix. + """ + if self.phoneme_tokenizer is None: + return phoneme_tokens_stacked, None + if self.phoneme_corruption_batch_prob <= 0.0: + return phoneme_tokens_stacked, None + if self.phoneme_corruption_timestep_ratio <= 0.0: + return phoneme_tokens_stacked, None + if torch.rand(1).item() >= self.phoneme_corruption_batch_prob: + return phoneme_tokens_stacked, None + + min_len = int(phoneme_tokens_lens_stacked.min().item()) + # Need room for BOS and EOS plus at least one interior timestep. + if min_len <= 2: + return phoneme_tokens_stacked, None + + # Corrupt only interior steps, keeping BOS/EOS untouched. + valid_start = 1 + valid_end = min_len - 1 # exclusive + num_valid_steps = max(0, valid_end - valid_start) + if num_valid_steps == 0: + return phoneme_tokens_stacked, None + + num_corrupt_steps = int(round(num_valid_steps * self.phoneme_corruption_timestep_ratio)) + num_corrupt_steps = max(1, min(num_valid_steps, num_corrupt_steps)) + + corrupted = phoneme_tokens_stacked.clone() + mode = 'unk' if torch.rand(1).item() < self.phoneme_corruption_unk_mode_prob else 'repeat_skip' + + candidate_steps = torch.arange(valid_start, valid_end, device=phoneme_tokens_stacked.device) + corrupt_steps = candidate_steps[torch.randperm(num_valid_steps, device=phoneme_tokens_stacked.device)][ + :num_corrupt_steps + ] + + if mode == 'unk': + if not hasattr(self.phoneme_tokenizer, 'unk_token_id'): + raise ValueError("Phoneme tokenizer is missing `unk_token_id` required for UNK corruption.") + corrupted[:, :, corrupt_steps] = self.phoneme_tokenizer.unk_token_id + return corrupted, mode + + # Repeat/skip corruption with a shared remap over [0, min_len). + # This keeps batched execution efficient and applies the same corrupted timeline across the batch. 
+ source_index = torch.arange(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) + step_delta = torch.ones(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) + op_is_repeat = torch.rand(corrupt_steps.numel(), device=phoneme_tokens_stacked.device) < 0.5 + step_delta[corrupt_steps] = torch.where(op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2)) + source_index = torch.cumsum(step_delta, dim=0) - step_delta[0] + source_index = torch.clamp(source_index, min=0, max=min_len - 1) + source_index[0] = 0 + source_index[-1] = min_len - 1 + + corrupted_prefix = phoneme_tokens_stacked[:, :, :min_len].index_select(dim=2, index=source_index) + corrupted[:, :, :min_len] = corrupted_prefix + return corrupted, mode def prepare_audio_channel_embeddings( self, @@ -1657,10 +1743,7 @@ def process_batch( # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False - if dropout_phoneme_input and dropout_text_input: - dropout_phoneme_input = random.random() < 0.5 - dropout_text_input = not dropout_phoneme_input + dropout_phoneme_input = False # Determine CFG unconditional dropout dropout_conditional_input = False @@ -1707,17 +1790,24 @@ def process_batch( phoneme_channel_embedding = None phoneme_tokens_stacked = None phoneme_tokens_lens_stacked = None + phoneme_corruption_mode = None + dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: + # Corrupt phonemes only when text input is not dropped. 
+ apply_phoneme_corruption = mode == 'train' and not dropout_text_input and not dropout_conditional_input + dropout_complete_phoneme_channel = dropout_conditional_input ( phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_corruption_mode, ) = self.prepare_phoneme_channel_embeddings( phoneme_tokens=phoneme_tokens, phoneme_tokens_lens=phoneme_tokens_lens, delay=phoneme_delay, - dropout_phoneme_input=dropout_phoneme_input or dropout_conditional_input, + apply_corruption=apply_phoneme_corruption, + dropout_complete_phoneme_channel=dropout_complete_phoneme_channel, ) # 5. Prepare audio channel embeddings @@ -1854,14 +1944,12 @@ def process_batch( pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): + if phoneme_corruption_mode != 'repeat_skip' and not dropout_complete_phoneme_channel: phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) - print("No Dropout - phoneme loss:", phoneme_loss.item()) else: phoneme_loss = torch.tensor(0.0, device=logits.device) - print("Dropout - phoneme loss skipped", phoneme_loss.item()) loss = loss + phoneme_loss @@ -3046,6 +3134,8 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # Get phoneme logits all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + phoneme_logits = all_code_logits_t_phoneme.view(actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size) + max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) # Sample phonemes if state.phoneme_sampling_method == 'argmax': @@ -3054,6 +3144,20 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: 
pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk ) + + # In prediction mode, low-confidence phoneme steps are replaced with UNK across + # all stacked channels (except steps where EOS is predicted). + if ( + state.phoneme_input_type != 'gt' + and hasattr(self.phoneme_tokenizer, 'unk_token_id') + and self.phoneme_confidence_unk_threshold > 0.0 + ): + underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(dim=1, keepdim=True) # (B, 1) + eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) + replace_with_unk = underconfident_step & (~eos_predicted_step) + if replace_with_unk.any(): + unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id) + pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens) # (B, phoneme_stacking_factor) return pred_phoneme_tokens diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index 006be87ebaa2..d225136989f1 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -82,7 +82,6 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: 'frame_stacking_factor': 2, 'cfg_unconditional_prob': 0.0, 'dropout_text_input_prob': 0.0, - 'dropout_phoneme_input_prob': 0.0, 'local_transformer_type': 'none', 'run_val_inference': False, # Optim placeholder (required by ModelPT but not used) From 2bd08ed66180ccddbb3046bfa593fc541d045159 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 18:28:19 -0800 Subject: [PATCH 51/94] revisit defaults and update Signed-off-by: Shehzeen Hussain --- examples/tts/conf/magpietts/easy_magpietts.yaml | 2 +- examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml | 2 +- nemo/collections/tts/models/easy_magpietts.py | 5 ++--- 3 files changed, 4 
insertions(+), 5 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 9545897ceda3..ef2ad794c2d0 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -89,7 +89,7 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 - phoneme_confidence_unk_threshold: 0.35 + phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation. dropout_text_input_prob: 0.1 phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 19d39f4cf320..a6330272a1da 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -84,7 +84,7 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 - phoneme_confidence_unk_threshold: 0.35 + phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation. dropout_text_input_prob: 0.1 phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 2fc5da261fe6..eec9e58a4161 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -382,7 +382,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
- self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.35) + self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) self.pad_context_text_to_max_duration = False @@ -1743,7 +1743,6 @@ def process_batch( # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = False # Determine CFG unconditional dropout dropout_conditional_input = False @@ -1794,7 +1793,7 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. - apply_phoneme_corruption = mode == 'train' and not dropout_text_input and not dropout_conditional_input + apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) dropout_complete_phoneme_channel = dropout_conditional_input ( phoneme_channel_embedding, From 18e39b04d91093c881bf70846c033029e6944bce Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 22:23:00 -0800 Subject: [PATCH 52/94] bug fix phoneme loss Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index eec9e58a4161..6b49977013e8 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1348,7 +1348,7 @@ def prepare_phoneme_channel_embeddings( delay: torch.Tensor, apply_corruption: bool = False, dropout_complete_phoneme_channel: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: """ Prepare phoneme embeddings as a channel input with delay 
handling. @@ -1370,6 +1370,7 @@ def prepare_phoneme_channel_embeddings( - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + - phoneme_tokens_stacked_clean: Clean stacked phoneme tokens before corruption (B, S, T') - corruption_mode: None, "unk", or "repeat_skip" """ batch_size = phoneme_tokens.size(0) @@ -1385,6 +1386,7 @@ def prepare_phoneme_channel_embeddings( self.phoneme_stacking_factor, 1, ) + phoneme_tokens_stacked_clean = phoneme_tokens_stacked.clone() phoneme_corruption_mode = None if apply_corruption: @@ -1419,6 +1421,7 @@ def prepare_phoneme_channel_embeddings( phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_tokens_stacked_clean, phoneme_corruption_mode, ) @@ -1789,6 +1792,7 @@ def process_batch( phoneme_channel_embedding = None phoneme_tokens_stacked = None phoneme_tokens_lens_stacked = None + phoneme_tokens_stacked_clean = None phoneme_corruption_mode = None dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: @@ -1800,6 +1804,7 @@ def process_batch( phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_tokens_stacked_clean, phoneme_corruption_mode, ) = self.prepare_phoneme_channel_embeddings( phoneme_tokens=phoneme_tokens, @@ -1940,10 +1945,10 @@ def process_batch( target_lens=phoneme_tokens_lens_stacked - 1, ) pb_phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) - pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() + pb_phoneme_tokens_target = phoneme_tokens_stacked_clean[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if phoneme_corruption_mode != 'repeat_skip' and not dropout_complete_phoneme_channel: + if (phoneme_corruption_mode != 'repeat_skip') and not (dropout_complete_phoneme_channel or 
dropout_conditional_input or dropout_text_input): phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) From e5d141b93b0328c52a66ecdd3ffac8abea47d1e7 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 09:47:31 -0800 Subject: [PATCH 53/94] another inference bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 6b49977013e8..b92ebf03d8ac 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,6 +380,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) @@ -412,6 +413,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): for _ in range(self.phoneme_stacking_factor): phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + print("phoneme_vocab_size for final proj.", self.phoneme_vocab_size) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) # Decoder backend selection - supports HuggingFace models or NemotronH @@ -2832,8 +2834,8 @@ def streaming_step( # ==================== DETERMINE PHASES PER BATCH ITEM ==================== needs_context = state.context_position < state.full_context_lens # (B,) bool needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) - needs_audio = (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + needs_phoneme = (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) # --- Context phase items: use next context embedding --- @@ -2874,7 +2876,7 @@ def streaming_step( # The EOS token itself IS embedded normally (matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. 
- is_eos_token = text_tokens == self.eos_id # (B,) bool + is_eos_token = text_tokens == self.eos_id & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token From b1b86f0c68d5fd81c893f8521da938a0b12e64ad Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 09:51:12 -0800 Subject: [PATCH 54/94] phoneme vocab size fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index b92ebf03d8ac..33e0c6cf0aef 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,7 +380,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size - self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) From 4a872aaa9c9328f03d26e99482cc508c0e93585b Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 10:04:16 -0800 Subject: [PATCH 55/94] bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 33e0c6cf0aef..9a12b2480adf 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2875,7 +2875,7 @@ def streaming_step( # The EOS token itself IS embedded normally (matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. - is_eos_token = text_tokens == self.eos_id & needs_text # (B,) bool + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token From beaee7b944e8a2f0812c3c67e156a36cf2850422 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 11:46:59 -0800 Subject: [PATCH 56/94] handle legacy model phoneme vocab size Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 9a12b2480adf..ca3b1eb56c2e 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,6 +380,10 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = 
cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + if cfg.get('phoneme_corruption_batch_prob', None) is None: + # Legacy mode: remove the UNK token from the phoneme vocabulary + # TODO: Remove this. + self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) @@ -412,7 +416,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): for _ in range(self.phoneme_stacking_factor): phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) - print("phoneme_vocab_size for final proj.", self.phoneme_vocab_size) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) # Decoder backend selection - supports HuggingFace models or NemotronH From 0879a1251ba737408b35c00848ac37659e53789f Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Feb 2026 12:06:15 -0800 Subject: [PATCH 57/94] context duration handling - stop repeating excessively Signed-off-by: Shehzeen Hussain --- .../tts/data/text_to_speech_dataset.py | 18 ++++++++++++++---- .../tts/data/text_to_speech_dataset_lhotse.py | 18 ++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index e25e703f52ee..f680a8d9eb34 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -420,6 +420,12 @@ def get_num_audio_samples_to_slice(self, duration, sample_rate): def __getitem__(self, index): data = self.data_samples[index] + + def _sample_context_duration_with_available_limit(available_duration_sec: float) -> float: + 
effective_duration_max = min(self.context_duration_max, available_duration_sec) + effective_duration_max = max(self.context_duration_min, effective_duration_max) + return random.uniform(self.context_duration_min, effective_duration_max) + tokenizer_name = "english_phoneme" # Default to english phoneme tokenizer if data.tokenizer_names is not None: # Pick a random tokenizer from the list of tokenizers @@ -489,8 +495,10 @@ def __getitem__(self, index): if self.load_cached_codes_if_available and 'context_audio_codes_path' in data.manifest_entry: context_audio_codes_path = data.manifest_entry['context_audio_codes_path'] context_audio_codes = torch.load(context_audio_codes_path) # (8, T) - # Sample random duration between self.context_duration_min and self.context_duration_max - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = ( + context_audio_codes.shape[1] * self.codec_model_samples_per_frame / self.sample_rate + ) + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_frames_to_slice = int( _context_duration_to_slice * self.sample_rate / self.codec_model_samples_per_frame ) @@ -517,7 +525,8 @@ def __getitem__(self, index): duration=context_duration, ) context_audio_array = context_audio_array.samples - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(context_audio_array) / self.sample_rate + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = self.get_num_audio_samples_to_slice(_context_duration_to_slice, self.sample_rate) if _num_samples_to_slice < len(context_audio_array): start_idx = random.randint(0, len(context_audio_array) - _num_samples_to_slice) @@ -566,7 +575,8 @@ def __getitem__(self, index): sample_rate=16000, volume_norm=self.volume_norm, ) - 
_context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(audio_array_16khz) / 16000 + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = int(_context_duration_to_slice * 16000) if _num_samples_to_slice < len(audio_array_16khz): start_idx = random.randint(0, len(audio_array_16khz) - _num_samples_to_slice) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 464b988b9415..56a80d6af63c 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -232,6 +232,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. phoneme_token_list = [] phoneme_token_len_list = [] + + def _sample_context_duration_with_available_limit(available_duration_sec: float) -> float: + effective_duration_max = min(self.context_duration_max, available_duration_sec) + effective_duration_max = max(self.context_duration_min, effective_duration_max) + return random.uniform(self.context_duration_min, effective_duration_max) + for cut in cuts: speaker = cut.supervisions[0].speaker if not check_speaker_format(speaker): @@ -276,8 +282,10 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: # and duration are None to the load function. 
context_audio_codes_array = cut.context_codes.load().astype(np.int32) context_audio_codes = torch.from_numpy(context_audio_codes_array) # (C, T) - # Sample random duration between self.context_duration_min and self.context_duration_max - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = ( + context_audio_codes.shape[1] * self.codec_model_samples_per_frame / self.sample_rate + ) + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_frames_to_slice = int( _context_duration_to_slice * self.sample_rate / self.codec_model_samples_per_frame ) @@ -301,7 +309,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: context_audio_array = cut.context_audio.resample(self.sample_rate).load_audio().squeeze(0) if self.volume_norm: context_audio_array = normalize_volume(context_audio_array) - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(context_audio_array) / self.sample_rate + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = self.get_num_audio_samples_to_slice( _context_duration_to_slice, self.sample_rate ) @@ -351,7 +360,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: audio_array_16khz = cut.target_audio.resample(16_000).load_audio().squeeze(0) if self.volume_norm: audio_array_16khz = normalize_volume(audio_array_16khz) - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(audio_array_16khz) / 16_000 + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = int(_context_duration_to_slice * 16_000) if _num_samples_to_slice < len(audio_array_16khz): start_idx = 
random.randint(0, len(audio_array_16khz) - _num_samples_to_slice) From ae557ac2696516216f759855b248e951a18459b7 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Feb 2026 12:24:32 -0800 Subject: [PATCH 58/94] clamp cer and wer to 1 Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index ca3b1eb56c2e..47138ac18094 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2263,8 +2263,8 @@ def validation_step(self, batch, batch_idx): if pred_transcripts[idx] is None: continue gt_transcript = process_text_for_cer(batch['raw_texts'][idx]) - cer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True) - wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) + cer = min(word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True), 1.0) + wer = min(word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False), 1.0) batch_cer.append(cer) batch_wer.append(wer) ssim = None From 3d69a1253fa27d7b787ad5e3ef5717cb96c14020 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 17 Feb 2026 00:13:57 -0800 Subject: [PATCH 59/94] Preference Optimization for EasyMagpieTTS (#64) * PO for EM-TTS Signed-off-by: Shehzeen Hussain * add PO mode in training Signed-off-by: Shehzeen Hussain * PO code update Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * bug fixes Signed-off-by: Shehzeen Hussain * logging for gradient tracking Signed-off-by: Shehzeen Hussain * GRPO working Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain Signed-off-by: Shehzeen Hussain --- examples/tts/easy_magpietts.py | 24 +- 
.../tts/data/text_to_speech_dataset_lhotse.py | 5 +- nemo/collections/tts/models/__init__.py | 2 + nemo/collections/tts/models/easy_magpietts.py | 50 +- .../easy_magpietts_preference_optimization.py | 1141 +++++++++++++++++ nemo/core/classes/modelPT.py | 2 +- 6 files changed, 1208 insertions(+), 16 deletions(-) create mode 100644 nemo/collections/tts/models/easy_magpietts_preference_optimization.py diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 4195060b87ef..5e9be71a7805 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -14,9 +14,9 @@ import lightning.pytorch as pl import torch.multiprocessing as mp -from omegaconf import OmegaConf +from omegaconf import OmegaConf, open_dict -from nemo.collections.tts.models import EasyMagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, EasyMagpieTTSModelOnlinePO from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -42,15 +42,25 @@ def main(cfg): trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) exp_manager(trainer, cfg.get("exp_manager", None)) - model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + mode = cfg.get('mode', 'train') + if mode == 'train': + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + elif mode == 'onlinepo_train': + model_cfg = cfg.model + with open_dict(model_cfg): + model_cfg.reference_model_ckpt_path = cfg.init_from_ptl_ckpt + model = EasyMagpieTTSModelOnlinePO(cfg=model_cfg, trainer=trainer) + elif mode == 'test': + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + else: + raise NotImplementedError(f"Only train, onlinepo_train and test modes are supported. 
Got {mode}") + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - if cfg.get('mode', 'train') == 'train': + if mode in ['train', 'onlinepo_train']: trainer.fit(model) - elif cfg.get('mode', 'train') == 'test': + elif mode == 'test': trainer.test(model) - else: - raise NotImplementedError(f"Only train and test modes are supported. Got {cfg.mode}") if __name__ == '__main__': diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 56a80d6af63c..ffd6b5629cc4 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -244,7 +244,10 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) - language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" + if cut.has_custom("lang"): + language = cut.lang + else: + language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" language_list.append(language) # target audio or target codes diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 20984cfccc6a..0783c79bacab 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -15,6 +15,7 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel +from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from 
nemo.collections.tts.models.hifigan import HifiGanModel @@ -36,6 +37,7 @@ "InferBatchOutput", "MagpieTTSModel", "EasyMagpieTTSModel", + "EasyMagpieTTSModelOnlinePO", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 47138ac18094..5dd61563788d 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -454,6 +454,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) + # self.decoder.float() # Task embedding for multi-mode training # Each mode has a unique task embedding that is prepended to the context @@ -718,6 +719,14 @@ def codes_to_audio(self, codes, codes_len): # Pass the modified integer token IDs if self._codec_converter is not None: codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + if codes_len.min() < 4: + # Pad the codes with 0s to make the minimum length 4 + # codes is (B, C, T) + codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) + # Updates all lens less than 4 to 4 + codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) + codes = codes[:,:,:codes_len.max()] + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) # audio_len: (B,) @@ -934,6 +943,12 @@ def local_transformer_sample_autoregressive( cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits + # Replace NaN/inf then clamp to prevent extreme values (e.g. 
from CFG) causing NaN in softmax + # print("codebook_logits stats before nan_to_num") + # print(f"min: {codebook_logits.min()}, max: {codebook_logits.max()}, mean: {codebook_logits.mean()}, std: {codebook_logits.std()}") + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') for item_idx in finished_items: @@ -985,6 +1000,9 @@ def sample_codes_from_logits( si = idx * self.num_all_tokens_per_codebook ei = si + self.num_all_tokens_per_codebook codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') for item_idx in finished_items: @@ -1016,6 +1034,9 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t si = idx * self.phoneme_vocab_size ei = si + self.phoneme_vocab_size codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( -1 @@ -2145,11 +2166,11 @@ def validation_step(self, batch, batch_idx): if self.run_val_inference: infer_output = self.infer_batch( batch, - max_decoder_steps=220, + max_decoder_steps=300, temperature=0.7, topk=80, 
use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, - use_cfg=True, + use_cfg=self.cfg.get('inference_use_cfg_in_val', True), cfg_scale=2.5 ) @@ -2610,6 +2631,7 @@ def streaming_init( gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, gt_audio_codes: Optional[torch.Tensor] = None, gt_audio_codes_lens: Optional[torch.Tensor] = None, + use_inference_mode: bool = True, ) -> StreamingState: """ Initialize streaming TTS inference state. @@ -2655,7 +2677,8 @@ def streaming_init( Returns: StreamingState: Initial state for streaming inference. """ - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): batch_size = context_audio_codes.size(0) device = context_audio_codes.device @@ -2785,6 +2808,7 @@ def streaming_step( state: StreamingState, text_tokens: Optional[torch.Tensor] = None, force_dropout_text: bool = False, + use_inference_mode: bool = True, ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform one streaming inference step with batch support. @@ -2827,7 +2851,8 @@ def streaming_step( if state.finished.all(): return state, None, None - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): device = state.device batch_size = state.batch_size streaming_speech_delay = state.training_mode.streaming_speech_delay @@ -3200,6 +3225,7 @@ def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, tor def streaming_finalize( self, state: StreamingState, + use_inference_mode: bool = True, ) -> StreamingFinalizeOutput: """ Finalize streaming and return the complete generated audio and phoneme predictions. 
@@ -3249,7 +3275,8 @@ def streaming_finalize( phoneme_text=phoneme_text_list, ) - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) total_frames = all_codes.size(-1) @@ -3317,6 +3344,7 @@ def infer_batch( phoneme_sampling_method: str = 'argmax', force_dropout_text: bool = False, use_teacher_forced: bool = False, + use_inference_mode: bool = True, ) -> InferBatchOutput: """ Batch inference using streaming infrastructure. @@ -3352,7 +3380,8 @@ def infer_batch( Returns: InferBatchOutput containing predicted audio, codes, and RTF metrics. """ - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): start_time = time.time() # Extract tensors from batch @@ -3440,6 +3469,7 @@ def infer_batch( gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, gt_audio_codes=gt_audio_codes_for_init, gt_audio_codes_lens=gt_audio_codes_lens_for_init, + use_inference_mode=use_inference_mode, ) time_to_first_prediction = None @@ -3447,7 +3477,12 @@ def infer_batch( device = text.device # Generate until all items are finished or max steps reached + print("Generation started") + gen_step = 0 while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + gen_step += 1 + if gen_step % 10 == 0: + print(f"Generation step {gen_step} ") # Gather the correct text token for each batch item based on text_tokens_seen # Items in context phase will have their token ignored by streaming_step positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) @@ -3463,6 +3498,7 @@ def infer_batch( state=state, text_tokens=current_tokens, force_dropout_text=force_dropout_text, + use_inference_mode=use_inference_mode, ) # Record time to first audio prediction @@ -3472,7 +3508,7 @@ def infer_batch( 
tts_generation_time = time.time() - generation_start_time # Finalize and decode audio - finalize_output = self.streaming_finalize(state) + finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) end_time = time.time() total_time = end_time - start_time diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py new file mode 100644 index 000000000000..1bc94c14206f --- /dev/null +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -0,0 +1,1141 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import copy
import os
import random
import time
from typing import Dict, List, Optional

import numpy as np
import soundfile as sf
import torch
from lightning.pytorch import Trainer
from omegaconf import DictConfig, open_dict

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig
from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel
from nemo.collections.tts.parts.utils.helpers import (
    get_mask_from_lengths,
    get_speaker_embeddings_from_filepaths,
    process_text_for_cer,
    transcribe_with_whisper,
)
from nemo.utils import logging

try:
    import torchaudio
    from torchaudio.pipelines import SQUIM_OBJECTIVE

    HAVE_TORCHAUDIO = True
except ImportError:
    HAVE_TORCHAUDIO = False

try:
    from nemo_text_processing.text_normalization.normalize import Normalizer

    PYNINI_AVAILABLE = True
except (ImportError, ModuleNotFoundError):
    Normalizer = None
    PYNINI_AVAILABLE = False


class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel):
    """
    EasyMagpie-TTS online preference optimization model (GRPO / DR-GRPO).

    Training flow:
    1. Sample multiple generations per prompt.
    2. Compute rewards (CER/SSIM/PESQ).
    3. Compute group-normalized advantages.
    4. Run teacher-forced policy forward on generated codes and optimize GRPO objective.
    5. Add auxiliary phoneme loss from the same forward pass with GT phoneme tokens.
    """

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg, trainer)

        self.run_val_inference = True  # Always run validation inference in PO.
        # Manual optimization: training_step drives zero_grad/backward/step itself
        # (required because one step spans rollout + chunked teacher-forced passes).
        self.automatic_optimization = False

        # Build a dataset-free copy of the config for the frozen reference policy.
        ref_model_cfg = copy.deepcopy(cfg)
        with open_dict(ref_model_cfg):
            ref_model_cfg.train_ds = None
            ref_model_cfg.validation_ds = None

        self.reference_free = self.cfg.get('reference_free', False)
        if not self.reference_free:
            self._reference_model = EasyMagpieTTSModel(cfg=ref_model_cfg)
            logging.info("Loading EasyMagpie reference model from checkpoint")
            # NOTE(review): weights_only=False deserializes arbitrary pickled objects;
            # only load trusted checkpoints here.
            self._reference_model.load_state_dict(
                torch.load(cfg.reference_model_ckpt_path, map_location="cpu", weights_only=False)['state_dict']
            )
            self._reference_model.freeze()
            # Marker flag; presumably consumed elsewhere to skip serialization — TODO confirm.
            self._reference_model._no_state_dict = True
            logging.info("Reference model loaded and frozen")

        # Reward ASR backend: either a frozen NeMo model or frozen Whisper large-v3.
        reward_asr_model = cfg.get('reward_asr_model', 'nemo')
        if reward_asr_model == 'nemo':
            self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_name=cfg.get('reward_asr_model_name', "nvidia/parakeet-ctc-0.6b")
            )
            self._eval_asr_model.freeze()
            self.whisper_processor = None
            self.whisper_model = None
        elif reward_asr_model == 'whisper':
            from transformers import WhisperForConditionalGeneration, WhisperProcessor

            self._eval_asr_model = None
            self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
            self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
            self.whisper_model.eval()
            for param in self.whisper_model.parameters():
                param.requires_grad = False
            # NOTE(review): use_multilingual_asr is only set on the whisper path;
            # any reader of this attribute on the nemo path would raise AttributeError — confirm.
            self.use_multilingual_asr = True
        else:
            raise ValueError(f"Unknown reward_asr_model: {reward_asr_model}")

        # Frozen speaker-verification model used for the speaker-similarity reward.
        self._eval_speaker_verification_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name=cfg.get('speaker_verification_model_name', 'titanet_large')
        )
        self._eval_speaker_verification_model.freeze()

        use_pesq = self.cfg.get('use_pesq', False)
        if use_pesq:
            assert HAVE_TORCHAUDIO, "torchaudio is required for PESQ reward."
            self.squim_objective_model = SQUIM_OBJECTIVE.get_model()

        self.loss_type = self.cfg.get('loss_type', 'grpo')
        if self.loss_type not in ['grpo', 'dr_grpo']:
            raise ValueError(
                f"Received loss_type={self.loss_type}. Supported values: ['grpo', 'dr_grpo']."
            )
        self.scale_rewards = self.cfg.get('scale_rewards', True)
        self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220)
        self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0)
        # How many prompt groups to run per teacher-forced sub-batch (memory knob).
        self.po_groups_per_subbatch = max(int(self.cfg.get('po_groups_per_subbatch', 1)), 1)

        self._normalize_whisper_transcript = self.cfg.get('normalize_whisper_transcript', True)
        # NOTE(review): _normalizer_cache only exists on the whisper+normalize path;
        # _get_cached_normalizer assumes it exists — confirm it is never called otherwise.
        if reward_asr_model == 'whisper' and self._normalize_whisper_transcript:
            self._normalizer_cache = {}

        # Filter out poor groups for stable optimization.
        self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0)
        self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0)

        # Under true fp32 training, force the decoder out of any lower-precision state.
        if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"):
            self.decoder.float()

    def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]:
        """Return a dict mapping module-group name → list of trainable parameters."""
        # Frozen eval/reference models are excluded from all grad/weight diagnostics.
        modules_to_exclude = {
            '_speaker_verification_model', '_codec_model', '_eval_asr_model',
            '_eval_speaker_verification_model', '_reference_model',
            'whisper_model', 'whisper_processor', 'squim_objective_model',
        }
        groups: Dict[str, List[torch.nn.Parameter]] = {}
        for name, module in self.named_children():
            if name in modules_to_exclude:
                continue
            params = [p for p in module.parameters() if p.requires_grad]
            if params:
                groups[name] = params
        return groups

    @torch.no_grad()
    def _compute_grad_and_weight_metrics(self) -> Dict[str, float]:
        """Compute per-module grad_norm, weight_norm, and global aggregates."""
        module_groups = self._get_trainable_module_groups()
        metrics: Dict[str, float] = {}
        all_grad_norms, all_weight_norms = [], []

        for group_name, params in module_groups.items():
            grad_norms, weight_norms = [], []
            for p in params:
                weight_norms.append(p.data.norm(2).item())
                if p.grad is not None:
                    grad_norms.append(p.grad.data.norm(2).item())

            # Module norm = L2 norm over the concatenation of its parameter tensors.
            module_weight_norm = float(np.sqrt(sum(w ** 2 for w in weight_norms)))
            metrics[f'weight_norm/{group_name}'] = module_weight_norm
            all_weight_norms.extend(weight_norms)

            if grad_norms:
                module_grad_norm = float(np.sqrt(sum(g ** 2 for g in grad_norms)))
                metrics[f'grad_norm/{group_name}'] = module_grad_norm
                all_grad_norms.extend(grad_norms)
            else:
                metrics[f'grad_norm/{group_name}'] = 0.0

        if all_grad_norms:
            metrics['grad_norm/global'] = float(np.sqrt(sum(g ** 2 for g in all_grad_norms)))
        if all_weight_norms:
            metrics['weight_norm/global'] = float(np.sqrt(sum(w ** 2 for w in all_weight_norms)))
        return metrics

    @torch.no_grad()
    def _compute_weight_update_metrics(self, prev_weights: Dict[int, torch.Tensor]) -> Dict[str, float]:
        """Compute per-module weight delta norms (how much weights changed after optimizer step)."""
        metrics: Dict[str, float] = {}
        module_groups = self._get_trainable_module_groups()
        all_deltas = []
        for group_name, params in module_groups.items():
            deltas = []
            for p in params:
                pid = id(p)  # Keyed by id(); see _snapshot_trainable_weights.
                if pid in prev_weights:
                    deltas.append((p.data - prev_weights[pid]).norm(2).item())
            if deltas:
                metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d ** 2 for d in deltas)))
                all_deltas.extend(deltas)
        if all_deltas:
            metrics['weight_delta/global'] = float(np.sqrt(sum(d ** 2 for d in all_deltas)))
        return metrics

    @torch.no_grad()
    def _snapshot_trainable_weights(self) -> Dict[int, torch.Tensor]:
        """Take a snapshot of all trainable parameter values (by param id)."""
        snapshot = {}
        for params in self._get_trainable_module_groups().values():
            for p in params:
                snapshot[id(p)] = p.data.clone()
        return snapshot
int) -> None: + """Print a compact per-module summary of grad_norm / weight_norm / weight_delta.""" + if not getattr(self.trainer, "is_global_zero", True): + return + + lines = [f"\n[grad/weight] step={step} " + f"grad={metrics.get('grad_norm/global', 0.0):.6f} " + f"w={metrics.get('weight_norm/global', 0.0):.4f} " + f"Δw={metrics.get('weight_delta/global', 0.0):.8f}"] + + module_names = sorted( + k.split('/')[1] for k in metrics + if k.startswith('weight_norm/') and k != 'weight_norm/global' + ) + for name in module_names: + gn = metrics.get(f'grad_norm/{name}', 0.0) + wn = metrics.get(f'weight_norm/{name}', 0.0) + wd = metrics.get(f'weight_delta/{name}', 0.0) + lines.append(f" {name:40s} grad={gn:.6f} w={wn:.4f} Δw={wd:.8f}") + + summary = "\n".join(lines) + print(summary) + logging.info(summary) + + def setup_optimizer_param_groups(self): + """ + Exclude frozen eval/reference modules AND modules that receive no gradients + from the PO loss (final_proj, lm_text_head, phoneme_final_proj) from the + optimizer. Including them would subject their weights to weight decay without + any learning signal, slowly degrading them. + """ + modules_to_exclude = { + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + '_reference_model', + 'whisper_model', + 'whisper_processor', + # These modules are not used by the PO loss and receive no gradients. + # Including them would only apply weight decay, degrading their weights. 
+ 'final_proj', + 'lm_text_head', + 'phoneme_final_proj', + } + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude and hasattr(module, "parameters"): + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + self._optimizer_param_groups = [{"params": trainable_params}] + + def state_dict(self, destination=None, prefix='', keep_vars=False): + state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + keys_substrings_to_exclude = ['_reference_model'] + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def _get_cached_normalizer(self, lang_key: Optional[str]): + if not PYNINI_AVAILABLE: + return None + lang_key = lang_key if lang_key else "en" + if lang_key not in self._normalizer_cache: + logging.info(f"Creating normalizer for language: {lang_key}") + try: + self._normalizer_cache[lang_key] = Normalizer(input_case="cased", lang=lang_key) + except Exception as e: + logging.warning(f"Failed to create normalizer for language: {lang_key}. Error: {e}") + self._normalizer_cache[lang_key] = None + return self._normalizer_cache[lang_key] + + def _get_per_token_logps(self, logits: torch.Tensor, labels: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor: + # Force fp32 for log_softmax to avoid bf16 precision issues that sever the + # gradient path through the GRPO "exp(logps - logps.detach())" trick. + # Under bf16 autocast, the tiny gradient signal through this identity-like + # expression gets rounded to zero, disconnecting local_transformer_out_projections. 
+ with torch.cuda.amp.autocast(enabled=False): + logits_fp32 = logits.float() + per_token_logps = torch.gather(logits_fp32.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) + per_token_logps = per_token_logps * loss_mask.float() + return per_token_logps + + + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): + """ + Override parent to force fp32 computation for the entire local transformer logits path. + + Under bf16-mixed autocast, the nn.Linear out_projections execute in bf16 and insert + ToCopyBackward0 nodes in the autograd graph. The GRPO loss formula + ``exp(logps - logps.detach())`` produces an identity in the forward pass, but the + gradient signal through this expression is extremely small. The bf16 ToCopyBackward0 + nodes round these tiny gradients to zero, completely severing the gradient path to + local_transformer_out_projections. Running the full computation in fp32 preserves + the gradient fidelity. + """ + with torch.cuda.amp.autocast(enabled=False): + # Cast dec_out to fp32 if it's in a lower precision (e.g. 
bf16 from autocast) + dec_out_fp32 = dec_out.float() + return super().compute_local_transformer_logits( + dec_out_fp32, audio_codes_target, targets_offset_by_one=targets_offset_by_one + ) + + def repeat_items_in_batch(self, batch: Dict, num_repeats: int) -> Dict: + repeated_batch = {} + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + repeated_batch[key] = value.repeat_interleave(num_repeats, dim=0) + elif isinstance(value, list): + repeated_value = [] + for item in value: + repeated_value.extend([item] * num_repeats) + repeated_batch[key] = repeated_value + else: + repeated_batch[key] = value + return repeated_batch + + def _get_audio_dir(self) -> str: + if self.logger is not None and hasattr(self.logger, "log_dir") and self.logger.log_dir is not None: + log_dir = self.logger.log_dir + elif self.trainer is not None and self.trainer.log_dir is not None: + log_dir = self.trainer.log_dir + else: + log_dir = "." + audio_dir = os.path.join(log_dir, 'online_po_audios') + os.makedirs(audio_dir, exist_ok=True) + return audio_dir + + def _save_waveforms_to_paths( + self, + waveforms: torch.Tensor, + waveform_lens: torch.Tensor, + prefix: str, + sample_rate: int, + ) -> List[str]: + audio_dir = self._get_audio_dir() + time_id = time.time_ns() + paths = [] + for idx in range(waveforms.size(0)): + wav = waveforms[idx].float().detach().cpu().numpy() + wav = wav[: int(waveform_lens[idx].item())] + # path = os.path.join(audio_dir, f'{prefix}_rank{self.global_rank}_{time_id}_{idx}.wav') + path = os.path.join(audio_dir, f'{prefix}_rank{self.global_rank}_{idx}.wav') + sf.write(path, wav, sample_rate) + paths.append(path) + return paths + + def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: + """ + Build per-item reference audio paths for speaker similarity reward. + Priority: audio_filepaths -> context_audio -> context_audio_codes. 
    def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]:
        """
        Build per-item reference audio paths for speaker similarity reward.
        Priority: context_audio -> context_audio_codes (decoded through the codec).
        NOTE(review): an earlier docstring also listed audio_filepaths as the first
        priority, but that branch is not implemented here — confirm intent.
        Raises ValueError if neither context source is present in the batch.
        """
        if 'context_audio' in batch_repeated and 'context_audio_lens' in batch_repeated:
            # TODO: Add support for text-context items here.
            return self._save_waveforms_to_paths(
                waveforms=batch_repeated['context_audio'],
                waveform_lens=batch_repeated['context_audio_lens'],
                prefix='reference_context_audio',
                sample_rate=self.sample_rate,
            )

        if 'context_audio_codes' in batch_repeated and 'context_audio_codes_lens' in batch_repeated:
            context_codes = batch_repeated['context_audio_codes'].clone()
            context_lens = batch_repeated['context_audio_codes_lens'].clone()

            target_codes = batch_repeated['audio_codes'].clone()
            target_lens = batch_repeated['audio_codes_lens'].clone()

            # For items where context_lens < 3, fall back to target_codes/target_lens
            # This is for items with text context
            short_context_mask = context_lens < 3
            if short_context_mask.any():
                # Pad the shorter tensor along the time dimension if needed
                max_len = max(context_codes.shape[-1], target_codes.shape[-1])
                if context_codes.shape[-1] < max_len:
                    pad_size = max_len - context_codes.shape[-1]
                    context_codes = torch.nn.functional.pad(context_codes, (0, pad_size), value=0)
                if target_codes.shape[-1] < max_len:
                    pad_size = max_len - target_codes.shape[-1]
                    target_codes = torch.nn.functional.pad(target_codes, (0, pad_size), value=0)
                context_codes[short_context_mask] = target_codes[short_context_mask]
                context_lens[short_context_mask] = target_lens[short_context_mask]
            # Slice to the actual max length needed
            context_codes = context_codes[..., :context_lens.max()]

            if self._codec_converter is not None:
                context_codes = self._codec_converter.convert_original_to_new(
                    audio_tokens=context_codes, audio_lens=context_lens
                ).long()
            context_audio, context_audio_lens, _ = self.codes_to_audio(context_codes, context_lens)
            return self._save_waveforms_to_paths(
                waveforms=context_audio,
                waveform_lens=context_audio_lens,
                prefix='reference_context_codes_decoded',
                sample_rate=self.output_sample_rate,
            )

        raise ValueError(
            "Could not construct reference audio for speaker similarity. Need one of: "
            "context_audio/context_audio_lens, or context_audio_codes/context_audio_codes_lens."
        )

    def _run_easy_process_batch(
        self,
        model: EasyMagpieTTSModel,
        batch: Dict,
        audio_codes: torch.Tensor,
        audio_codes_lens: torch.Tensor,
        mode: str,
    ):
        """Run ``model.process_batch`` (teacher-forced forward) on ``batch`` with the
        given target audio codes, encoding context audio to codes on the fly if the
        batch does not already carry cached context codes."""
        if 'context_audio_codes' in batch:
            context_audio_codes = batch['context_audio_codes']
            context_audio_codes_lens = batch['context_audio_codes_lens']
        else:
            context_audio_codes, context_audio_codes_lens = model.audio_to_codes(
                batch['context_audio'], batch['context_audio_lens']
            )

        return model.process_batch(
            text=batch['text'],
            text_lens=batch['text_lens'],
            context_text_tokens=batch['context_text_tokens'],
            context_text_tokens_lens=batch['context_text_tokens_lens'],
            audio_codes=audio_codes,
            audio_codes_lens=audio_codes_lens,
            context_audio_codes=context_audio_codes,
            context_audio_codes_lens=context_audio_codes_lens,
            phoneme_tokens=batch.get('phoneme_tokens'),
            phoneme_tokens_lens=batch.get('phoneme_tokens_lens'),
            mode=mode,
        )

    def _format_text_table(self, headers: List[str], rows: List[List[str]]) -> str:
        """Render ``headers``/``rows`` (all strings) as a fixed-width ASCII table.

        Columns are left-justified to the widest cell; header and body are separated
        by a dashed rule. Assumes every row has len(headers) cells.
        """
        col_widths = [len(h) for h in headers]
        for row in rows:
            for col_idx, value in enumerate(row):
                col_widths[col_idx] = max(col_widths[col_idx], len(value))

        header_line = " | ".join(headers[col_idx].ljust(col_widths[col_idx]) for col_idx in range(len(headers)))
        separator = "-+-".join("-" * col_widths[col_idx] for col_idx in range(len(headers)))
        row_lines = [
            " | ".join(row[col_idx].ljust(col_widths[col_idx]) for col_idx in range(len(headers))) for row in rows
        ]
        return "\n".join([header_line, separator] + row_lines)
    def _print_group_cer_wer_table(
        self,
        batch: Dict,
        batch_metrics: List[Dict],
        group_idx: int,
        group_start_idx: int,
        group_end_idx: int,
        is_group_valid: bool,
        mean_reward: float,
        std_reward: float,
    ) -> None:
        """Pretty-print per-generation CER/WER/SSIM/reward/advantage for one prompt group (rank 0 only)."""
        if not getattr(self.trainer, "is_global_zero", True):
            return

        prompt_text = str(batch['raw_texts'][group_idx]).replace("\n", " ")
        if len(prompt_text) > 120:
            prompt_text = f"{prompt_text[:117]}..."

        rows = []
        for local_idx, metric_idx in enumerate(range(group_start_idx, group_end_idx)):
            item_metrics = batch_metrics[metric_idx]
            rows.append(
                [
                    str(local_idx),
                    f"{item_metrics['cer_gt']:.4f}",
                    f"{item_metrics['wer_gt']:.4f}",
                    f"{item_metrics['spk_similarity']:.4f}",
                    f"{item_metrics['reward']:.4f}",
                    f"{item_metrics.get('advantage', 0.0):.4f}",
                ]
            )

        table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "reward", "advantage"], rows=rows)
        print(
            f"[generate_and_reward] group={group_idx} valid={is_group_valid} "
            f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n"
            f"prompt: {prompt_text}\n{table}\n"
        )

    def generate_and_reward(
        self,
        batch: Dict,
        num_generations_per_item: int,
        mode: str = 'train',
        use_local_transformer_for_inference: bool = False,
    ):
        """Roll out ``num_generations_per_item`` samples per prompt, score them, and
        compute group-normalized advantages.

        Pipeline: repeat batch → autoregressive inference → save wavs → ASR transcribe
        (NeMo or Whisper) → speaker-similarity and optional PESQ → per-item reward →
        group min-reward substitution for invalid code lengths → per-group advantage
        normalization and validity filtering.

        Returns a dict with rewards, advantages, group validities, the repeated batch,
        predicted codes/lens, the rollout phoneme input type, and timing stats.
        """
        batch_repeated = self.repeat_items_in_batch(batch, num_generations_per_item)
        reward_asr_model = self.cfg.get('reward_asr_model', 'nemo')
        use_pesq = self.cfg.get('use_pesq', False)

        # Classifier-free guidance is sampled per call (train) or forced when prob == 1.0.
        use_cfg = False
        cfg_scale = 1.0
        inference_cfg_prob = self.cfg.get('inference_cfg_prob', 0.0)
        if (inference_cfg_prob == 1.0) or (inference_cfg_prob > 0.0 and mode == 'train'):
            use_cfg = random.random() < inference_cfg_prob
            cfg_scale = self.cfg.get('inference_cfg_scale', 1.0)

        # Optionally condition rollouts on ground-truth phonemes (train only).
        phoneme_input_type = 'pred'
        gt_phoneme_input_prob = self.cfg.get('gt_phoneme_input_prob', 0.0)
        can_use_gt_phonemes = ('phoneme_tokens' in batch_repeated) and ('phoneme_tokens_lens' in batch_repeated)
        if can_use_gt_phonemes and gt_phoneme_input_prob > 0.0 and mode == 'train':
            phoneme_input_type = 'gt' if random.random() < gt_phoneme_input_prob else 'pred'

        generation_start_time = time.perf_counter()
        print("Inference started")
        output = self.infer_batch(
            batch=batch_repeated,
            max_decoder_steps=self.max_decoder_steps,
            temperature=self.cfg.get('inference_temperature', 0.7),
            topk=self.cfg.get('inference_topk', 80),
            use_cfg=use_cfg,
            cfg_scale=cfg_scale,
            use_local_transformer_for_inference=use_local_transformer_for_inference,
            phoneme_input_type=phoneme_input_type,
            phoneme_sampling_method=self.cfg.get('inference_phoneme_sampling_method', 'argmax'),
            force_dropout_text=False,
            use_teacher_forced=False,
            use_inference_mode=False,
        )
        print("Inference ended")
        audio_generation_time_sec = time.perf_counter() - generation_start_time

        predicted_audio = output.predicted_audio
        predicted_audio_lens = output.predicted_audio_lens
        predicted_codes = output.predicted_codes
        predicted_codes_lens = output.predicted_codes_lens
        save_start_time = time.perf_counter()
        predicted_audio_paths = self._save_waveforms_to_paths(
            waveforms=predicted_audio,
            waveform_lens=predicted_audio_lens,
            prefix='generated',
            sample_rate=self.output_sample_rate,
        )
        audio_save_time_sec = time.perf_counter() - save_start_time
        audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))]

        rewarding_start_time = time.perf_counter()
        if reward_asr_model == 'nemo':
            pred_transcripts = self._eval_asr_model.transcribe(
                predicted_audio_paths,
                batch_size=len(predicted_audio_paths),
                override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0),
            )
            pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts]
        else:
            # Whisper path: one (slow) per-file transcription with optional per-language normalization.
            self.whisper_model.to(self.device)
            pred_transcripts = []
            langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths))
            for item_idx, audio_path in enumerate(predicted_audio_paths):
                language = langs[item_idx] if item_idx < len(langs) else 'en'
                normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None
                print(f"Transcribing audio {audio_path} with language {language}")
                transcript = transcribe_with_whisper(
                    audio_filepath=audio_path,
                    language=language,
                    whisper_processor=self.whisper_processor,
                    whisper_model=self.whisper_model,
                    device=self.device,
                    normalizer=normalizer,
                )
                print(f"Pred Transcript: {transcript}")
                print(f"Normalized Pred Text: {process_text_for_cer(transcript)}")
                print(f"Raw Text: {batch_repeated['raw_texts'][item_idx]}")
                print("--------------------------------")
                pred_transcripts.append(process_text_for_cer(transcript))

        reference_audio_paths = self._get_reference_audio_paths(batch_repeated)
        try:
            pred_speaker_embeddings = get_speaker_embeddings_from_filepaths(
                predicted_audio_paths, self._eval_speaker_verification_model, self.device
            )
            gt_speaker_embeddings = get_speaker_embeddings_from_filepaths(
                reference_audio_paths, self._eval_speaker_verification_model, self.device
            )
        except Exception as e:
            # Deliberate best-effort: a failed SV pass zeroes SSIM instead of killing the step.
            logging.warning(f"Speaker-embedding reward failed. Falling back to zero SSIM reward. Error: {e}")
            pred_speaker_embeddings = None
            gt_speaker_embeddings = None

        batch_metrics = []
        cer_reward_weight = self.cfg.get('cer_reward_weight', 0.5)
        ssim_reward_weight = self.cfg.get('ssim_reward_weight', 0.5)
        pesq_reward_weight = self.cfg.get('pesq_reward_weight', 0.0)
        min_valid_codes_len = self.cfg.get('min_valid_codes_len', 4)
        max_valid_codes_len = self.cfg.get(
            'max_valid_codes_len', self.max_decoder_steps * self.frame_stacking_factor - 1
        )

        for idx in range(predicted_audio.size(0)):
            pred_transcript = pred_transcripts[idx]
            gt_transcript = process_text_for_cer(batch_repeated['raw_texts'][idx])
            # CER/WER clamped into [0, 1] so rewards stay bounded.
            cer_gt = min(max(word_error_rate([pred_transcript], [gt_transcript], use_cer=True), 0.0), 1.0)
            wer_gt = min(max(word_error_rate([pred_transcript], [gt_transcript], use_cer=False), 0.0), 1.0)

            if pred_speaker_embeddings is not None and gt_speaker_embeddings is not None:
                # Cosine similarity between predicted and reference speaker embeddings.
                spk_embedding_pred = pred_speaker_embeddings[idx].cpu().float().numpy()
                spk_embedding_gt = gt_speaker_embeddings[idx].cpu().float().numpy()
                denom = max(np.linalg.norm(spk_embedding_pred) * np.linalg.norm(spk_embedding_gt), 1e-8)
                spk_similarity = float(np.dot(spk_embedding_pred, spk_embedding_gt) / denom)
            else:
                spk_similarity = 0.0

            if use_pesq:
                # SQUIM expects 16 kHz input.
                sample_audio, sr = torchaudio.load(predicted_audio_paths[idx])
                sample_audio = sample_audio.to(self.device)
                if sr != 16000:
                    sample_audio = torchaudio.functional.resample(sample_audio, sr, 16000)
                _, pesq_hyp, _ = self.squim_objective_model(sample_audio)
                pesq_hyp = float(pesq_hyp.item())
            else:
                pesq_hyp = 0.0

            item_metrics = {
                'cer_gt': float(cer_gt),
                'wer_gt': float(wer_gt),
                'duration': float(audio_durations[idx]),
                'spk_similarity': float(spk_similarity),
                'pred_transcript': pred_transcript,
                'gt_transcript': gt_transcript,
                'codes_len': int(predicted_codes_lens[idx].item()),
                'pesq': float(pesq_hyp),
            }

            best_ssim_achievable = self.cfg.get('best_ssim_achievable', 0.9)
            mean_cer_dataset = self.cfg.get('mean_cer_dataset', 0.1)
            mean_ssim_dataset = self.cfg.get('mean_ssim_dataset', 0.6)

            # Piecewise-linear rewards centered at the dataset means: 0.5 at the mean,
            # rising to 1.0 at the best value and falling to 0.0 at the worst.
            item_cer = item_metrics['cer_gt']
            item_ssim = max(min(item_metrics['spk_similarity'], best_ssim_achievable), 0.0)
            if item_cer <= mean_cer_dataset:
                cer_reward = 0.5 + 0.5 * (mean_cer_dataset - item_cer) / max(mean_cer_dataset, 1e-8)
            else:
                cer_reward = 0.5 - 0.5 * (item_cer - mean_cer_dataset) / max(1.0 - mean_cer_dataset, 1e-8)

            if item_ssim >= mean_ssim_dataset:
                spk_similarity_reward = 0.5 + 0.5 * (item_ssim - mean_ssim_dataset) / max(
                    best_ssim_achievable - mean_ssim_dataset, 1e-8
                )
            else:
                spk_similarity_reward = 0.5 - 0.5 * (mean_ssim_dataset - item_ssim) / max(mean_ssim_dataset, 1e-8)

            # 4.5 is the PESQ scale maximum, mapping PESQ into [0, 1].
            pesq_reward = item_metrics['pesq'] / 4.5 if use_pesq else 0.0
            reward = (
                cer_reward * cer_reward_weight
                + spk_similarity_reward * ssim_reward_weight
                + pesq_reward * pesq_reward_weight
            )
            # Too-short or truncated (max-step) generations get the group's min reward later.
            if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len):
                item_metrics['_needs_group_min_reward'] = True
            else:
                item_metrics['_needs_group_min_reward'] = False

            item_metrics['cer_reward'] = float(cer_reward)
            item_metrics['spk_similarity_reward'] = float(spk_similarity_reward)
            item_metrics['pesq_reward'] = float(pesq_reward)
            item_metrics['reward'] = float(reward)
            batch_metrics.append(item_metrics)

        # Second pass: replace rewards for items with invalid code lengths with the group minimum reward
        num_groups = len(batch['raw_texts'])
        for group_idx in range(num_groups):
            group_start_idx = group_idx * num_generations_per_item
            group_end_idx = group_start_idx + num_generations_per_item
            group_rewards = [batch_metrics[idx]['reward'] for idx in range(group_start_idx, group_end_idx)]
            group_min_reward = min(group_rewards)
            for idx in range(group_start_idx, group_end_idx):
                if batch_metrics[idx]['_needs_group_min_reward']:
                    batch_metrics[idx]['reward'] = float(group_min_reward)

        # Third pass: group-normalized advantages and CER-based group validity filtering.
        all_groups_mean_reward = 0.0
        all_groups_std_reward = 0.0
        group_validities = []
        for group_idx in range(num_groups):
            group_start_idx = group_idx * num_generations_per_item
            group_end_idx = group_start_idx + num_generations_per_item
            group_rewards = [batch_metrics[idx]['reward'] for idx in range(group_start_idx, group_end_idx)]
            group_cers = [batch_metrics[idx]['cer_gt'] for idx in range(group_start_idx, group_end_idx)]
            mean_reward = float(np.mean(group_rewards))
            std_reward = float(np.std(group_rewards))
            # A group is dropped from the loss if even its best generation is too bad,
            # or if its worst generation exceeds the hard CER ceiling.
            is_group_valid = True
            if min(group_cers) > self.best_cer_threshold:
                is_group_valid = False
            if max(group_cers) > self.worst_cer_threshold:
                is_group_valid = False

            for idx in range(group_start_idx, group_end_idx):
                advantage = batch_metrics[idx]['reward'] - mean_reward
                if self.scale_rewards:
                    advantage = advantage / (std_reward + 1e-4)
                batch_metrics[idx]['advantage'] = float(advantage)
            group_validities.append(is_group_valid)

            self._print_group_cer_wer_table(
                batch=batch,
                batch_metrics=batch_metrics,
                group_idx=group_idx,
                group_start_idx=group_start_idx,
                group_end_idx=group_end_idx,
                is_group_valid=is_group_valid,
                mean_reward=mean_reward,
                std_reward=std_reward,
            )

            all_groups_mean_reward += mean_reward
            all_groups_std_reward += std_reward

        all_groups_mean_reward = all_groups_mean_reward / max(num_groups, 1)
        all_groups_std_reward = all_groups_std_reward / max(num_groups, 1)
        advantages = torch.tensor([x['advantage'] for x in batch_metrics], device=self.device, dtype=torch.float32)
        group_validities = torch.tensor(group_validities, device=self.device, dtype=torch.float32)
        rewarding_time_sec = time.perf_counter() - rewarding_start_time

        return {
            'mean_reward': torch.tensor(all_groups_mean_reward, device=self.device, dtype=torch.float32),
            'std_reward': torch.tensor(all_groups_std_reward, device=self.device, dtype=torch.float32),
            'batch_repeated': batch_repeated,
            'metrics': batch_metrics,
            'predicted_codes': predicted_codes,
            'predicted_codes_lens': predicted_codes_lens,
            'advantages': advantages,
            'group_validities': group_validities,
            'rollout_phoneme_input_type': phoneme_input_type,
            'timings': {
                'audio_generation_time_sec': float(audio_generation_time_sec),
                'audio_save_time_sec': float(audio_save_time_sec),
                'rewarding_time_sec': float(rewarding_time_sec),
            },
        }

    def process_batch_online_po(self, batch: Dict, n_generations_per_item: int, mode: str = 'train'):
        """Full PO step without backward: rollout + reward, then teacher-forced loss computation.

        Used on paths (e.g. validation) where losses are reported but not optimized
        (do_backward=False in the chunked teacher-forced pass).
        """
        generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs(
            batch=batch,
            n_generations_per_item=n_generations_per_item,
            mode=mode,
        )
        chunked_outputs = self._run_teacher_forced_chunked_po(
            generated_codes_and_metrics=generated_codes_and_metrics,
            batch_repeated=batch_repeated,
            predicted_codes=predicted_codes,
            predicted_codes_lens=predicted_codes_lens,
            n_generations_per_item=n_generations_per_item,
            do_backward=False,
        )
        return {
            'mean_reward': generated_codes_and_metrics['mean_reward'],
            'std_reward': generated_codes_and_metrics['std_reward'],
            'loss': chunked_outputs['loss'],
            'po_loss': chunked_outputs['po_loss'],
            'phoneme_aux_loss': chunked_outputs['phoneme_aux_loss'],
            'kl_loss': chunked_outputs['kl_loss'],
            'used_gt_phoneme_input': chunked_outputs['used_gt_phoneme_input'],
            'batch_metrics': generated_codes_and_metrics['metrics'],
        }
    def _slice_batch_range(self, batch: Dict, start_idx: int, end_idx: int) -> Dict:
        """Slice every tensor/list in ``batch`` to items [start_idx, end_idx) along dim 0,
        then trim the time dimension of known (tensor, lens) pairs down to the
        sub-batch's own max length so the sub-batch carries no excess padding.
        """
        sliced_batch = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                sliced_batch[key] = value[start_idx:end_idx]
            elif isinstance(value, list):
                sliced_batch[key] = value[start_idx:end_idx]
            else:
                sliced_batch[key] = value

        # Trim only an explicit whitelist of temporal (tensor, lens) pairs so we never
        # accidentally truncate the last dim of a non-temporal tensor.
        temporal_key_pairs = [
            ('text', 'text_lens'),
            ('context_text_tokens', 'context_text_tokens_lens'),
            ('audio_codes', 'audio_codes_lens'),
            ('context_audio_codes', 'context_audio_codes_lens'),
            ('phoneme_tokens', 'phoneme_tokens_lens'),
            ('context_audio', 'context_audio_lens'),
            ('audio', 'audio_lens'),
        ]
        for tensor_key, lens_key in temporal_key_pairs:
            tensor_value = sliced_batch.get(tensor_key)
            lens = sliced_batch.get(lens_key)
            if not isinstance(tensor_value, torch.Tensor) or not isinstance(lens, torch.Tensor):
                continue
            # Skip pairs whose shapes don't line up (defensive; e.g. scalar metadata).
            if tensor_value.dim() < 2 or tensor_value.size(0) != lens.size(0):
                continue

            local_max_len = int(lens.max().item()) if lens.numel() > 0 else 0
            local_max_len = min(local_max_len, tensor_value.size(-1))
            sliced_batch[tensor_key] = tensor_value[..., :local_max_len]

        return sliced_batch

    def _iter_group_ranges(self, num_groups: int, groups_per_subbatch: int):
        """Yield (start, end) group-index ranges covering [0, num_groups) in chunks
        of at most ``groups_per_subbatch`` groups."""
        for group_start in range(0, num_groups, groups_per_subbatch):
            yield group_start, min(group_start + groups_per_subbatch, num_groups)

    def _prepare_online_po_inputs(self, batch: Dict, n_generations_per_item: int, mode: str):
        """Run rollouts + rewards under no_grad, then rewrite the repeated batch so
        the generated codes become the teacher-forcing targets.

        Returns (generated_codes_and_metrics, batch_repeated, predicted_codes,
        predicted_codes_lens); 'audio'/'audio_lens' are dropped from the repeated
        batch since the generated codes replace the ground-truth audio.
        """
        use_local_transformer_for_inference = False
        use_local_transformer_prob = self.cfg.get('use_local_transformer_prob', 0.0)
        if use_local_transformer_prob > 0.0 and mode == 'train':
            use_local_transformer_for_inference = random.random() < use_local_transformer_prob

        # Rollouts are sampled in eval mode (no dropout) and without gradients.
        with torch.no_grad():
            self.eval()
            generated_codes_and_metrics = self.generate_and_reward(
                batch=batch,
                num_generations_per_item=n_generations_per_item,
                mode=mode,
                use_local_transformer_for_inference=use_local_transformer_for_inference,
            )
            self.train()

        batch_repeated = generated_codes_and_metrics['batch_repeated']
        predicted_codes = generated_codes_and_metrics['predicted_codes']
        predicted_codes_lens = generated_codes_and_metrics['predicted_codes_lens']
        predicted_codes = predicted_codes[:, :, : predicted_codes_lens.max()]
        # Convert generated codes back to the original codec token space expected
        # by the teacher-forced forward pass.
        predicted_codes = self._codec_converter.convert_new_to_original(
            audio_tokens=predicted_codes, audio_lens=predicted_codes_lens
        )
        batch_repeated['audio_codes'] = predicted_codes
        batch_repeated['audio_codes_lens'] = predicted_codes_lens
        if 'audio' in batch_repeated:
            del batch_repeated['audio']
        if 'audio_lens' in batch_repeated:
            del batch_repeated['audio_lens']

        return generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens
    def _compute_po_losses_from_outputs(
        self,
        policy_output,
        reference_output,
        advantages: torch.Tensor,
        group_validities: torch.Tensor,
        rollout_phoneme_input_type: str,
    ):
        """Compute the GRPO / DR-GRPO objective (plus optional KL-to-reference and
        auxiliary phoneme loss) from teacher-forced policy/reference outputs.

        Per codebook: gathers per-token log-probs, applies the policy-gradient
        identity ``-exp(logps - logps.detach()) * advantage`` (value equals
        -advantage in the forward pass; gradient equals the REINFORCE gradient),
        masks invalid groups, optionally adds a k3-style KL penalty against the
        frozen reference, and averages across codebooks.
        """
        # Prefer local-transformer logits when present; fall back to global decoder logits.
        logits = policy_output.local_transformer_logits
        if logits is None:
            logits = policy_output.logits
        ref_logits = None
        if reference_output is not None:
            ref_logits = reference_output.local_transformer_logits
            if ref_logits is None:
                ref_logits = reference_output.logits

        audio_codes_target = policy_output.audio_codes_target.long()
        audio_codes_lens_target = policy_output.audio_codes_lens_target
        audio_loss_mask = get_mask_from_lengths(audio_codes_lens_target).float()

        n_codebooks = audio_codes_target.size(1)
        total_loss = None
        total_kl = None
        for codebook_idx in range(n_codebooks):
            # Codebook vocabularies are concatenated along the logit dim; slice this one out.
            si = codebook_idx * self.num_all_tokens_per_codebook
            ei = si + self.num_all_tokens_per_codebook
            codebook_logits = logits[:, :, si:ei]
            codebook_labels = audio_codes_target[:, codebook_idx, :]
            per_token_logps = self._get_per_token_logps(codebook_logits, codebook_labels, audio_loss_mask)
            # Ensure the GRPO policy gradient trick stays in fp32 to preserve gradient signal
            with torch.cuda.amp.autocast(enabled=False):
                per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1))
                per_token_loss = per_token_loss * group_validities.float().unsqueeze(1)

            if not self.reference_free and ref_logits is not None:
                with torch.no_grad():
                    ref_codebook_logits = ref_logits[:, :, si:ei]
                    per_token_ref_logps = self._get_per_token_logps(
                        ref_codebook_logits, codebook_labels, audio_loss_mask
                    )
                with torch.cuda.amp.autocast(enabled=False):
                    # k3 KL estimator: exp(r) - r - 1 with r = ref_logps - logps (>= 0).
                    per_token_kl = (
                        torch.exp(per_token_ref_logps.float() - per_token_logps.float()) - (per_token_ref_logps.float() - per_token_logps.float()) - 1
                    )
                per_token_loss = per_token_loss + self.cfg.get('grpo_beta', 0.0) * per_token_kl
                codebook_kl_loss_mean = (
                    (per_token_kl * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8)
                ).mean()
            else:
                codebook_kl_loss_mean = torch.tensor(0.0, device=self.device)

            if self.loss_type == "grpo":
                # GRPO: per-sequence mean over valid tokens, then batch mean.
                codebook_loss = (
                    (per_token_loss * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8)
                ).mean()
            elif self.loss_type == "dr_grpo":
                # DR-GRPO: normalize by a fixed token budget instead of per-sequence length.
                total_tokens = per_token_loss.shape[0] * self.max_decoder_steps
                codebook_loss = (per_token_loss * audio_loss_mask).sum() / max(total_tokens, 1)
            else:
                raise ValueError(f"Unknown loss function: {self.loss_type}")

            if total_loss is None:
                total_loss = codebook_loss
                total_kl = codebook_kl_loss_mean
            else:
                total_loss += codebook_loss
                total_kl += codebook_kl_loss_mean

        total_po_loss = total_loss / n_codebooks
        total_kl = total_kl / n_codebooks

        # Auxiliary phoneme loss only counts when the rollout was conditioned on GT phonemes.
        phoneme_aux_loss = policy_output.phoneme_loss if rollout_phoneme_input_type == 'gt' else None
        if phoneme_aux_loss is None:
            phoneme_aux_loss = torch.tensor(0.0, device=self.device)
        total_loss = total_po_loss + self.aux_phoneme_loss_weight * phoneme_aux_loss

        return {
            'loss': total_loss,
            'po_loss': total_po_loss,
            'phoneme_aux_loss': phoneme_aux_loss,
            'kl_loss': total_kl,
            'used_gt_phoneme_input': float(rollout_phoneme_input_type == 'gt'),
        }

    def _run_teacher_forced_chunked_po(
        self,
        generated_codes_and_metrics: Dict,
        batch_repeated: Dict,
        predicted_codes: torch.Tensor,
        predicted_codes_lens: torch.Tensor,
        n_generations_per_item: int,
        do_backward: bool,
    ):
        """Run the teacher-forced PO forward (and optional manual backward) over the
        repeated batch in sub-batches of whole groups to bound peak memory.

        Each chunk's loss is weighted by its share of groups so the accumulated
        totals equal the full-batch averages; when ``do_backward`` is True,
        ``manual_backward`` is called per chunk with the same weighting.
        """
        num_groups = len(batch_repeated['raw_texts']) // n_generations_per_item
        groups_per_subbatch = max(self.po_groups_per_subbatch, 1)

        accumulated_loss = torch.tensor(0.0, device=self.device)
        accumulated_po_loss = torch.tensor(0.0, device=self.device)
        accumulated_phoneme_aux_loss = torch.tensor(0.0, device=self.device)
        accumulated_kl_loss = torch.tensor(0.0, device=self.device)
        used_gt_phoneme_input = 0.0

        for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch):
            item_start_idx = group_start_idx * n_generations_per_item
            item_end_idx = group_end_idx * n_generations_per_item
            group_weight = float(group_end_idx - group_start_idx) / max(float(num_groups), 1.0)

            batch_sub = self._slice_batch_range(batch_repeated, item_start_idx, item_end_idx)
            predicted_codes_sub = predicted_codes[item_start_idx:item_end_idx]
            predicted_codes_lens_sub = predicted_codes_lens[item_start_idx:item_end_idx]
            predicted_codes_sub = predicted_codes_sub[:, :, : predicted_codes_lens_sub.max()]
            advantages_sub = generated_codes_and_metrics['advantages'][item_start_idx:item_end_idx]
            group_validities_sub = generated_codes_and_metrics['group_validities'][item_start_idx:item_end_idx]
            rollout_phoneme_input_type = generated_codes_and_metrics.get('rollout_phoneme_input_type', 'pred')

            # Use mode='val' intentionally for stable PO optimization:
            # no random input dropout, no CFG unconditional dropout, no random phoneme corruption.
            policy_output = self._run_easy_process_batch(
                model=self,
                batch=batch_sub,
                audio_codes=predicted_codes_sub,
                audio_codes_lens=predicted_codes_lens_sub,
                mode='val',
            )

            reference_output = None
            if not self.reference_free:
                with torch.no_grad():
                    reference_output = self._run_easy_process_batch(
                        model=self._reference_model,
                        batch=batch_sub,
                        audio_codes=predicted_codes_sub,
                        audio_codes_lens=predicted_codes_lens_sub,
                        mode='val',
                    )

            chunk_outputs = self._compute_po_losses_from_outputs(
                policy_output=policy_output,
                reference_output=reference_output,
                advantages=advantages_sub,
                group_validities=group_validities_sub,
                rollout_phoneme_input_type=rollout_phoneme_input_type,
            )

            if do_backward:
                # Manual optimization: accumulate gradients chunk by chunk.
                self.manual_backward(chunk_outputs['loss'] * group_weight)

            accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * group_weight
            accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * group_weight
            accumulated_phoneme_aux_loss = (
                accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight
            )
            accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight
            used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input'])

        return {
            'loss': accumulated_loss,
            'po_loss': accumulated_po_loss,
            'phoneme_aux_loss': accumulated_phoneme_aux_loss,
            'kl_loss': accumulated_kl_loss,
            'used_gt_phoneme_input': used_gt_phoneme_input,
        }
+ prev_weights = self._snapshot_trainable_weights() + + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode='train', + ) + teacher_forced_start_time = time.perf_counter() + po_outputs = self._run_teacher_forced_chunked_po( + generated_codes_and_metrics=generated_codes_and_metrics, + batch_repeated=batch_repeated, + predicted_codes=predicted_codes, + predicted_codes_lens=predicted_codes_lens, + n_generations_per_item=n_generations_per_item, + do_backward=True, + ) + teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time + + # Compute gradient/weight metrics BEFORE optimizer.step() clears gradients. + grad_weight_metrics = self._compute_grad_and_weight_metrics() + + optimizer.step() + + # Step the LR scheduler (required in manual optimization mode). + lr_schedulers = self.lr_schedulers() + if lr_schedulers is not None: + if isinstance(lr_schedulers, (list, tuple)): + for sched in lr_schedulers: + sched.step() + else: + lr_schedulers.step() + + # Compute weight delta metrics AFTER optimizer.step(). + grad_weight_metrics.update(self._compute_weight_update_metrics(prev_weights)) + + # Log learning rate. + self.log('learning_rate', optimizer.param_groups[0]['lr'], prog_bar=False, sync_dist=True) + + # Core training metrics. 
+ self.log('train_loss', po_outputs['loss'], prog_bar=True, sync_dist=True) + self.log('train_po_loss', po_outputs['po_loss'], prog_bar=True, sync_dist=True) + self.log('train_phoneme_aux_loss', po_outputs['phoneme_aux_loss'], prog_bar=True, sync_dist=True) + self.log('train_kl_loss', po_outputs['kl_loss'], prog_bar=True, sync_dist=True) + self.log('train_used_gt_phoneme_input', po_outputs['used_gt_phoneme_input'], prog_bar=True, sync_dist=True) + self.log('train_mean_reward', generated_codes_and_metrics['mean_reward'], prog_bar=True, sync_dist=True) + self.log('train_std_reward', generated_codes_and_metrics['std_reward'], prog_bar=True, sync_dist=True) + + # Gradient / weight diagnostics to wandb. + for metric_name, metric_value in grad_weight_metrics.items(): + self.log(f'train_{metric_name}', metric_value, prog_bar=False, sync_dist=True) + + # Compact summary to stdout / log file. + self._print_grad_weight_summary(grad_weight_metrics, step=self.global_step) + + # Timing metrics. + timings = generated_codes_and_metrics.get('timings', {}) + for tkey in ('audio_generation_time_sec', 'audio_save_time_sec', 'rewarding_time_sec'): + self.log(f'train_{tkey}', float(timings.get(tkey, 0.0)), prog_bar=False, sync_dist=True) + self.log('train_teacher_forced_time_sec', teacher_forced_time_sec, prog_bar=False, sync_dist=True) + + # def validation_step(self, batch, batch_idx): + # val_n_generations_per_item = self.cfg.get('val_n_generations_per_item', 1) + # po_outputs = self.process_batch_online_po( + # batch=batch, + # n_generations_per_item=val_n_generations_per_item, + # mode='val', + # ) + # self.validation_step_outputs.append( + # { + # 'mean_reward': po_outputs['mean_reward'], + # 'std_reward': po_outputs['std_reward'], + # 'val_loss': po_outputs['loss'], + # 'val_po_loss': po_outputs['po_loss'], + # 'val_phoneme_aux_loss': po_outputs['phoneme_aux_loss'], + # 'val_kl_loss': po_outputs['kl_loss'], + # 'val_used_gt_phoneme_input': torch.tensor( + # 
po_outputs['used_gt_phoneme_input'], device=self.device, dtype=torch.float32 + # ), + # 'batch_metrics': po_outputs['batch_metrics'], + # } + # ) + + # def on_validation_epoch_end(self): + # def collect(key: str): + # values = [] + # for x in self.validation_step_outputs: + # if x[key] is not None: + # values.append(x[key]) + # else: + # values.append(torch.tensor(0.0, device=self.device)) + # return torch.stack(values).mean() if len(values) > 0 else torch.tensor(0.0, device=self.device) + + # val_loss = collect("val_loss") + # val_po_loss = collect("val_po_loss") + # val_phoneme_aux_loss = collect("val_phoneme_aux_loss") + # val_kl_loss = collect("val_kl_loss") + # val_used_gt_phoneme_input = collect("val_used_gt_phoneme_input") + # mean_reward = collect("mean_reward") + # std_reward = collect("std_reward") + + # self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) + # self.log("val_po_loss", val_po_loss, prog_bar=True, sync_dist=True) + # self.log("val_phoneme_aux_loss", val_phoneme_aux_loss, prog_bar=True, sync_dist=True) + # self.log("val_kl_loss", val_kl_loss, prog_bar=True, sync_dist=True) + # self.log("val_used_gt_phoneme_input", val_used_gt_phoneme_input, prog_bar=True, sync_dist=True) + # self.log("val_mean_reward", mean_reward, prog_bar=True, sync_dist=True) + # self.log("val_std_reward", std_reward, prog_bar=True, sync_dist=True) + + # mean_metrics = {} + # for val_output in self.validation_step_outputs: + # for item_metrics in val_output['batch_metrics']: + # for key, value in item_metrics.items(): + # if "transcript" not in key: + # mean_metrics.setdefault(key, []).append(value) + # for key, values in mean_metrics.items(): + # self.log(f"val_{key}", float(np.mean(values)), prog_bar=True, sync_dist=True) + + # self.validation_step_outputs.clear() diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 027ca47a4e82..6d91ad25f976 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -1411,7 +1411,7 
@@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st if isinstance(cfg.init_from_ptl_ckpt, str): # Restore checkpoint ckpt_path = cfg.pop('init_from_ptl_ckpt') - ckpt = torch.load(ckpt_path, map_location=map_location) + ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) # Restore checkpoint into current model self.load_state_dict(ckpt['state_dict'], strict=False) From 2ca71812552b91d7be7752d902254f2ba6efdd4d Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 18 Feb 2026 11:47:27 -0800 Subject: [PATCH 60/94] po stabilize Signed-off-by: Shehzeen Hussain --- .../easy_magpietts_preference_optimization.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 1bc94c14206f..d5c94fec59b1 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -132,10 +132,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if reward_asr_model == 'whisper' and self._normalize_whisper_transcript: self._normalizer_cache = {} + # Entropy bonus coefficient – encourages exploration and prevents mode collapse. + # Set to 0.0 to disable. Typical range: 0.001–0.01. + self.entropy_coeff = self.cfg.get('entropy_coeff', 0.0) + # Filter out poor groups for stable optimization. 
self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0) self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0) + + if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"): self.decoder.float() @@ -871,6 +877,7 @@ def _compute_po_losses_from_outputs( n_codebooks = audio_codes_target.size(1) total_loss = None total_kl = None + total_entropy = None for codebook_idx in range(n_codebooks): si = codebook_idx * self.num_all_tokens_per_codebook ei = si + self.num_all_tokens_per_codebook @@ -882,6 +889,16 @@ def _compute_po_losses_from_outputs( per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1)) per_token_loss = per_token_loss * group_validities.float().unsqueeze(1) + # Per-token entropy of the policy distribution (always computed for logging). + with torch.cuda.amp.autocast(enabled=False): + logits_fp32 = codebook_logits.float() + log_probs = logits_fp32.log_softmax(-1) # [B, T, V] + probs = log_probs.exp() # [B, T, V] + per_token_entropy = -(probs * log_probs).sum(-1) # [B, T] + codebook_entropy = ( + (per_token_entropy * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8) + ).mean() + if not self.reference_free and ref_logits is not None: with torch.no_grad(): ref_codebook_logits = ref_logits[:, :, si:ei] @@ -912,23 +929,31 @@ def _compute_po_losses_from_outputs( if total_loss is None: total_loss = codebook_loss total_kl = codebook_kl_loss_mean + total_entropy = codebook_entropy else: total_loss += codebook_loss total_kl += codebook_kl_loss_mean + total_entropy += codebook_entropy total_po_loss = total_loss / n_codebooks total_kl = total_kl / n_codebooks + total_entropy = total_entropy / n_codebooks phoneme_aux_loss = policy_output.phoneme_loss if rollout_phoneme_input_type == 'gt' else None if phoneme_aux_loss is None: phoneme_aux_loss = torch.tensor(0.0, device=self.device) + + # Subtracting entropy encourages higher entropy (more 
exploration / prevents mode collapse). total_loss = total_po_loss + self.aux_phoneme_loss_weight * phoneme_aux_loss + if self.entropy_coeff > 0: + total_loss = total_loss - self.entropy_coeff * total_entropy return { 'loss': total_loss, 'po_loss': total_po_loss, 'phoneme_aux_loss': phoneme_aux_loss, 'kl_loss': total_kl, + 'entropy': total_entropy, 'used_gt_phoneme_input': float(rollout_phoneme_input_type == 'gt'), } @@ -948,6 +973,7 @@ def _run_teacher_forced_chunked_po( accumulated_po_loss = torch.tensor(0.0, device=self.device) accumulated_phoneme_aux_loss = torch.tensor(0.0, device=self.device) accumulated_kl_loss = torch.tensor(0.0, device=self.device) + accumulated_entropy = torch.tensor(0.0, device=self.device) used_gt_phoneme_input = 0.0 for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch): @@ -1001,6 +1027,7 @@ def _run_teacher_forced_chunked_po( accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight ) accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight + accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * group_weight used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input']) return { @@ -1008,6 +1035,7 @@ def _run_teacher_forced_chunked_po( 'po_loss': accumulated_po_loss, 'phoneme_aux_loss': accumulated_phoneme_aux_loss, 'kl_loss': accumulated_kl_loss, + 'entropy': accumulated_entropy, 'used_gt_phoneme_input': used_gt_phoneme_input, } @@ -1039,7 +1067,15 @@ def training_step(self, batch, batch_idx): ) teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time - # Compute gradient/weight metrics BEFORE optimizer.step() clears gradients. + # Clip gradients to prevent catastrophic updates from outlier batches. 
+ max_grad_norm = self.cfg.get('max_grad_norm', 1.0) + if max_grad_norm > 0: + torch.nn.utils.clip_grad_norm_( + [p for p in self.parameters() if p.requires_grad and p.grad is not None], + max_norm=max_grad_norm, + ) + + # Compute gradient/weight metrics AFTER clipping but BEFORE optimizer.step() clears them. grad_weight_metrics = self._compute_grad_and_weight_metrics() optimizer.step() @@ -1064,6 +1100,7 @@ def training_step(self, batch, batch_idx): self.log('train_po_loss', po_outputs['po_loss'], prog_bar=True, sync_dist=True) self.log('train_phoneme_aux_loss', po_outputs['phoneme_aux_loss'], prog_bar=True, sync_dist=True) self.log('train_kl_loss', po_outputs['kl_loss'], prog_bar=True, sync_dist=True) + self.log('train_entropy', po_outputs['entropy'], prog_bar=True, sync_dist=True) self.log('train_used_gt_phoneme_input', po_outputs['used_gt_phoneme_input'], prog_bar=True, sync_dist=True) self.log('train_mean_reward', generated_codes_and_metrics['mean_reward'], prog_bar=True, sync_dist=True) self.log('train_std_reward', generated_codes_and_metrics['std_reward'], prog_bar=True, sync_dist=True) From 1af65a9411a81f7502d8cbeed177111d63ff022c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 18 Feb 2026 20:51:33 -0500 Subject: [PATCH 61/94] mamba config update Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 18 ++++++++++++++---- .../conf/magpietts/easy_magpietts_lhotse.yaml | 18 ++++++++++++++---- nemo/collections/tts/models/easy_magpietts.py | 3 ++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index ef2ad794c2d0..3a9a274b624c 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -21,11 +21,11 @@ model: transformer_hf_backend: "Qwen/Qwen2.5-1.5B" # NemotronH config (used when decoder_type: "nemotron_h") - # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern: # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer nemotron_h_config: hidden_size: 1536 # Should match embedding_dim - num_hidden_layers: 24 + num_hidden_layers: 48 vocab_size: 131072 # Attention config num_attention_heads: 12 @@ -47,8 +47,17 @@ model: intermediate_size: 4096 mlp_hidden_act: "silu" mlp_bias: false - # Layer pattern: alternating Mamba and Attention - hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # MoE config (scaled from Nemotron-3-Nano-30B-A3B) + n_routed_experts: 48 + num_experts_per_tok: 6 + moe_intermediate_size: 1024 + moe_shared_expert_intermediate_size: 2048 + n_group: 1 + topk_group: 1 + routed_scaling_factor: 2.5 + norm_topk_prob: true + # Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention + hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*" # Normalization layer_norm_epsilon: 1e-5 residual_in_fp32: true @@ -69,6 +78,7 @@ model: local_transformer_type: "autoregressive" # "none", "autoregressive" # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 + phoneme_loss_weight: 1.0 local_transformer_n_layers: 3 local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index a6330272a1da..459c7cd071df 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -20,11 +20,11 @@ model: transformer_hf_backend: "Qwen/Qwen2.5-1.5B" # NemotronH config (used when decoder_type: "nemotron_h") - # This is a hybrid Mamba2/Attention model. Layer types are specified via hybrid_override_pattern: + # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). 
Layer types via hybrid_override_pattern: # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer nemotron_h_config: hidden_size: 1536 # Should match embedding_dim - num_hidden_layers: 24 + num_hidden_layers: 48 vocab_size: 131072 # Attention config num_attention_heads: 12 @@ -46,8 +46,17 @@ model: intermediate_size: 4096 mlp_hidden_act: "silu" mlp_bias: false - # Layer pattern: alternating Mamba and Attention - hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # MoE config (scaled from Nemotron-3-Nano-30B-A3B) + n_routed_experts: 48 + num_experts_per_tok: 6 + moe_intermediate_size: 1024 + moe_shared_expert_intermediate_size: 2048 + n_group: 1 + topk_group: 1 + routed_scaling_factor: 2.5 + norm_topk_prob: true + # Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention + hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*" # Normalization layer_norm_epsilon: 1e-5 residual_in_fp32: true @@ -66,6 +75,7 @@ model: local_transformer_type: "autoregressive" # "none", "autoregressive" # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 + phoneme_loss_weight: 1.0 local_transformer_n_layers: 3 local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5dd61563788d..c2249ce43092 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -376,6 +376,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) + self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) if cfg.get('phoneme_tokenizer', None) is not None: 
self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) @@ -1980,7 +1981,7 @@ def process_batch( else: phoneme_loss = torch.tensor(0.0, device=logits.device) - loss = loss + phoneme_loss + loss = loss + self.phoneme_loss_weight * phoneme_loss return ProcessBatchOutput( loss=loss, From 89cee8f0a65d947dd714a7e398126acb96876e3b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 18 Feb 2026 23:44:20 -0800 Subject: [PATCH 62/94] fix weight initialization bugs in mamba Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/nemotron_h_decoder.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index f89e0a8fd326..ba5aa25a77c0 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -898,6 +898,7 @@ def __init__(self, config: NemotronHConfig): self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32)) self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32)) + nn.init.normal_(self.weight, mean=0.0, std=config.initializer_range) @torch.no_grad() def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: @@ -1176,13 +1177,11 @@ def _init_weights(self): elif isinstance(module, nn.Embedding): nn.init.normal_(module.weight, std=self.config.initializer_range) - # Rescale prenorm residual weights for better training stability - # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) + # Rescale residual-branch output projections for better training stability. + # Apply 1/sqrt(num_hidden_layers) to Mamba, attention, and MLP/MoE branches. 
if self.config.rescale_prenorm_residual: for name, p in self.named_parameters(): - if "out_proj.weight" in name: - # Special Scaled Initialization for residual projections - # Scale by 1/sqrt(num_hidden_layers) + if any(k in name for k in ("out_proj.weight", "o_proj.weight", "down_proj.weight")): with torch.no_grad(): p /= math.sqrt(self.config.num_hidden_layers) From fb3343f27f793427e2803af7a87d7812625e0d79 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 20 Feb 2026 14:20:32 -0800 Subject: [PATCH 63/94] Magpietts decoderonly 2601 flash (#65) * config options Signed-off-by: Paarth Neekhara * flash attention and timing stats Signed-off-by: Paarth Neekhara * clean up timing code Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 9 +- .../tts/modules/nemotron_h_decoder.py | 131 +++++++++++++++++- 2 files changed, 131 insertions(+), 9 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index c2249ce43092..a69bb9b80801 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -377,6 +377,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) + self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) + self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) if cfg.get('phoneme_tokenizer', None) is not None: self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) @@ -1942,7 +1944,7 @@ def process_batch( # Compute codebook loss codebook_loss, _ = self.compute_loss(logits, 
audio_codes_target, audio_codes_lens_target) - loss = codebook_loss + loss = self.parallel_codebook_loss_scale * codebook_loss # Compute local transformer loss if applicable local_transformer_loss = None @@ -1955,8 +1957,7 @@ def process_batch( local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) - local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) - loss = loss + local_transformer_loss_scale * local_transformer_loss + loss = loss + self.local_transformer_loss_scale * local_transformer_loss # Compute phoneme loss if applicable phoneme_loss = None @@ -2167,7 +2168,7 @@ def validation_step(self, batch, batch_idx): if self.run_val_inference: infer_output = self.infer_batch( batch, - max_decoder_steps=300, + max_decoder_steps=330, temperature=0.7, topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index ba5aa25a77c0..ec30a1e7a699 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -63,11 +63,19 @@ CAUSAL_CONV1D_AVAILABLE = False try: - from flash_attn import flash_attn_func + from transformers.utils.import_utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10 - FLASH_ATTN_AVAILABLE = True + if is_flash_attn_2_available(): + from transformers.modeling_flash_attention_utils import _flash_attention_forward + + FLASH_ATTN_AVAILABLE = True + else: + _flash_attention_forward = None + FLASH_ATTN_AVAILABLE = False except ImportError: - flash_attn_func = None + is_flash_attn_2_available = None + is_flash_attn_greater_or_equal_2_10 = None + _flash_attention_forward = None FLASH_ATTN_AVAILABLE = False @@ -858,6 +866,101 @@ def forward( return attn_output, None, past_key_value +class NemotronHFlashAttention2(NemotronHAttention): + 
""" + FlashAttention2 path for NemotronH attention. + + Falls back to eager/SDPA attention if flash-attn is not installed. + """ + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__(config=config, layer_idx=layer_idx) + self._flash_attn_uses_top_left_mask = ( + not is_flash_attn_greater_or_equal_2_10() if is_flash_attn_greater_or_equal_2_10 is not None else True + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if not FLASH_ATTN_AVAILABLE or _flash_attention_forward is None: + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Query is [B, T, H, D] for flash-attn helper. + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + # Keep key/value as [B, H_kv, T, D] while updating cache. 
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Convert key/value to [B, T, H, D] for flash-attn helper. 
+ key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + sliding_window=getattr(self.config, "sliding_window", None), + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +NEMOTRONH_ATTENTION_CLASSES = { + "eager": NemotronHAttention, + "sdpa": NemotronHAttention, + "flash_attention_2": NemotronHFlashAttention2, +} + + class NemotronHMLP(nn.Module): """MLP layer for NemotronH.""" @@ -1082,7 +1185,15 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): if self.block_type == "mamba": self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) elif self.block_type == "attention": - self.mixer = NemotronHAttention(config, layer_idx=layer_idx) + attn_impl = config._attn_implementation + if attn_impl == "flash_attention_2" and not FLASH_ATTN_AVAILABLE: + logging.warning( + "NemotronH requested _attn_implementation='flash_attention_2' but flash-attn is unavailable. " + "Falling back to sdpa." 
+ ) + attn_impl = "sdpa" + attn_cls = NEMOTRONH_ATTENTION_CLASSES.get(attn_impl, NemotronHAttention) + self.mixer = attn_cls(config, layer_idx=layer_idx) elif self.block_type == "mlp": self.mixer = NemotronHMLP(config, layer_idx=layer_idx) elif self.block_type == "moe": @@ -1119,7 +1230,12 @@ def _forward_impl( if self.block_type == "mamba": hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) elif self.block_type == "attention": - hidden_states = self.mixer(hidden_states, cache_position=cache_position, past_key_value=cache_params) + hidden_states = self.mixer( + hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_value=cache_params, + ) hidden_states = hidden_states[0] elif self.block_type in ("mlp", "moe"): hidden_states = self.mixer(hidden_states) @@ -1284,6 +1400,11 @@ def forward( def _create_causal_mask(self, attention_mask, input_tensor, cache_position): """Create causal attention mask.""" + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and torch.any(attention_mask == 0): + return attention_mask + return None + dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] From dab64378c8833ba57763253f05b2dbafe7b41006 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 20 Feb 2026 17:55:05 -0800 Subject: [PATCH 64/94] add do tts method Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_inference.py | 2 +- nemo/collections/tts/models/easy_magpietts.py | 125 ++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index feead6519875..e97fc0ea7e9e 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -558,7 +558,7 @@ def main(argv=None): if args.longform_mode in {'always', 'auto'}: 
model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps elif args.is_decoder_only_model: - model_inference_parameters["max_decoder_steps"] = 220 + model_inference_parameters["max_decoder_steps"] = 300 else: model_inference_parameters["max_decoder_steps"] = 440 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index a69bb9b80801..91a790a8c2a5 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3556,6 +3556,131 @@ def infer_batch( phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, ) + @staticmethod + def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: + """ + Load context audio and resample if needed. + Returns tensor of shape (1, num_samples). + """ + audio, sr = sf.read(audio_path, dtype='float32') + if len(audio.shape) > 1: + audio = audio.mean(axis=1) + if sr != target_sample_rate: + import librosa + + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) + return torch.from_numpy(audio).unsqueeze(0) + + @staticmethod + def _adjust_audio_to_duration_for_inference( + audio: torch.Tensor, + sample_rate: int, + target_duration: float, + codec_model_samples_per_frame: int, + ) -> torch.Tensor: + """ + Match the same duration-alignment logic used in magpietts_streaming_inference.py. 
+ """ + num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) + target_num_samples = num_codec_frames * codec_model_samples_per_frame + current_num_samples = audio.size(1) + + if current_num_samples >= target_num_samples: + audio = audio[:, :target_num_samples] + else: + num_repeats = int(np.ceil(target_num_samples / current_num_samples)) + audio_repeated = audio.repeat(1, num_repeats) + audio = audio_repeated[:, :target_num_samples] + return audio + + def do_tts( + self, + transcript: str, + context_audio_file_path: Optional[str] = None, + context_text: str = "[NO TEXT CONTEXT]", + main_tokenizer_name: Optional[str] = None, + context_audio_duration: float = 5.0, + use_cfg: bool = True, + cfg_scale: float = 2.5, + use_local_transformer: bool = True, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 330, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate speech from transcript using EasyMagpie inference with optional context text/audio. + """ + if transcript is None or transcript.strip() == "": + raise ValueError("`transcript` must be a non-empty string.") + + device = next(self.parameters()).device + transcript = transcript.strip() + context_text = (context_text or "[NO TEXT CONTEXT]").strip() + + if main_tokenizer_name is None: + # Match model init behavior: default to first configured tokenizer. + main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] + if main_tokenizer_name not in self.tokenizer.tokenizers: + raise ValueError( + f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" + f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" + ) + + text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] + text = torch.tensor([text_tokens], dtype=torch.long, device=device) + text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) + + context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) + context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) + + if context_audio_file_path is not None and context_audio_file_path.strip() != "": + context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) + context_audio = self._adjust_audio_to_duration_for_inference( + context_audio, + self.sample_rate, + context_audio_duration, + self.codec_model_samples_per_frame, + ) + context_audio = context_audio.to(device) + context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + else: + context_audio_codes = torch.zeros( + 1, + self.data_num_audio_codebooks, + 0, + dtype=torch.long, + device=device, + ) + context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tensor, + 'context_text_tokens_lens': context_text_lens, + 'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + } + + with torch.inference_mode(): + output = self.infer_batch( + batch=batch, + max_decoder_steps=max_steps, + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer_for_inference=use_local_transformer, + phoneme_input_type='pred', + 
phoneme_sampling_method='argmax', + use_teacher_forced=False, + use_inference_mode=True, + ) + return output.predicted_audio, output.predicted_audio_lens + @classmethod def list_available_models(cls) -> List[PretrainedModelInfo]: return [] From d58581b22c0b6b7f9e2a68654eda38df1ca2e482 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 22 Feb 2026 22:23:11 -0500 Subject: [PATCH 65/94] bug fix Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index e97fc0ea7e9e..19085f78eb96 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -510,7 +510,7 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( - '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] + '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] ) target_group.add_argument('--dropout_text_input', action='store_true') From 9ec6767637f31b5d37c634654dc9e0cc9e6f17ac Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 23 Feb 2026 22:37:40 -0800 Subject: [PATCH 66/94] Magpietts decoderonly 2601 utmos po (#67) * add utmos to PO Signed-off-by: Shehzeen Hussain * utmos in PO Signed-off-by: Shehzeen Hussain * whisper update Signed-off-by: Shehzeen Hussain * batched utmos Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 97 +++++++++-- .../easy_magpietts_preference_optimization.py | 161 +++++++++++++----- 
.../magpietts_preference_optimization.py | 20 ++- nemo/collections/tts/modules/utmosv2.py | 13 +- nemo/collections/tts/parts/utils/helpers.py | 136 +++++++++++++-- 5 files changed, 351 insertions(+), 76 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 91a790a8c2a5..680a313618e6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -51,15 +51,24 @@ cosine_schedule, ) from nemo.collections.tts.parts.utils.helpers import ( + compute_utmos_scores_from_filepaths, get_mask_from_lengths, get_speaker_embeddings_from_filepaths, process_text_for_cer, transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +try: + from nemo.collections.tts.modules.utmosv2 import UTMOSv2Calculator + + HAVE_UTMOSV2 = True +except (ImportError, ModuleNotFoundError): + HAVE_UTMOSV2 = False + @dataclass class TrainingMode: @@ -562,6 +571,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._eval_speaker_verification_model.freeze() logging.info("Eval models loaded successfully.") + # UTMOSv2 naturalness scoring for validation (optional) + self.use_utmos = cfg.get('use_utmos', False) + if self.use_utmos: + assert HAVE_UTMOSV2, ( + "UTMOSv2 is required for UTMOS scoring but is not installed. " + "Install it with: pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + ) + self._utmos_calculator = UTMOSv2Calculator(device='cpu') + logging.info("UTMOSv2 calculator initialized for validation naturalness scoring") + def setup_optimizer_param_groups(self): """ Override to exclude frozen eval/inference-only models from the optimizer. 
@@ -575,6 +594,7 @@ def setup_optimizer_param_groups(self): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', } # Collect parameter ids to exclude @@ -610,6 +630,7 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', ] for key in list(state_dict.keys()): if any([substring in key for substring in keys_substrings_to_exclude]): @@ -633,6 +654,7 @@ def load_state_dict(self, state_dict, strict=True): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', ]: continue if any(param.numel() > 0 for param in child.parameters()): @@ -2243,21 +2265,34 @@ def validation_step(self, batch, batch_idx): languages = batch.get('languages', None) if languages is None: languages = ['en'] * len(predicted_audio_paths) - pred_transcripts = [] - for audio_path, lang in zip(predicted_audio_paths, languages): - try: - transcript = transcribe_with_whisper( - audio_path, - lang, - self.whisper_processor, - self.whisper_model, - self.device, - normalizer=None, - ) - pred_transcripts.append(process_text_for_cer(transcript)) - except Exception as e: - logging.warning(f"Val ASR transcription failed for {audio_path}: {e}") - pred_transcripts.append(None) + try: + transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=predicted_audio_paths, + language=languages, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=None, + ) + pred_transcripts = [process_text_for_cer(transcript) for transcript in transcripts] + except Exception as e: + logging.warning(f"Val batched ASR transcription failed, falling back to per-file mode: {e}") + pred_transcripts = [] + for item_idx, audio_path in enumerate(predicted_audio_paths): + lang = languages[item_idx] if item_idx < len(languages) else 'en' + try: + transcript = 
transcribe_with_whisper( + audio_path, + lang, + self.whisper_processor, + self.whisper_model, + self.device, + normalizer=None, + ) + pred_transcripts.append(process_text_for_cer(transcript)) + except Exception as inner_e: + logging.warning(f"Val ASR transcription failed for {audio_path}: {inner_e}") + pred_transcripts.append(None) else: pred_transcripts = self._eval_asr_model.transcribe( predicted_audio_paths, @@ -2280,8 +2315,23 @@ def validation_step(self, batch, batch_idx): logging.warning(f"Val speaker embeddings failed: {e}") pred_embeddings = ctx_embeddings = None + utmos_scores = None + if getattr(self, 'use_utmos', False) and hasattr(self, '_utmos_calculator'): + utmos_batch_size = max(int(self.cfg.get('utmos_batch_size', len(predicted_audio_paths))), 1) + utmos_num_workers = max(int(self.cfg.get('utmos_num_workers', 0)), 0) + try: + utmos_scores = compute_utmos_scores_from_filepaths( + audio_filepaths=predicted_audio_paths, + utmos_calculator=self._utmos_calculator, + batch_size=utmos_batch_size, + num_workers=utmos_num_workers, + rank_tag=str(self.global_rank), + ) + except Exception as e: + raise RuntimeError(f"Val UTMOSv2 batched scoring failed: {e}") from e + # Compute per-sample metrics for successful cases only - batch_cer, batch_wer, batch_ssim = [], [], [] + batch_cer, batch_wer, batch_ssim, batch_utmos = [], [], [], [] for idx in range(len(predicted_audio_paths)): if pred_transcripts[idx] is None: continue @@ -2296,9 +2346,16 @@ def validation_step(self, batch, batch_idx): ctx_emb = ctx_embeddings[idx].cpu().float().numpy() ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) batch_ssim.append(ssim) + + # UTMOSv2 naturalness score (MOS on 1-5 scale) + utmos_score = None if utmos_scores is None else float(utmos_scores[idx]) + if utmos_score is not None: + batch_utmos.append(utmos_score) + + utmos_str = f", UTMOS={utmos_score:.4f}" if utmos_score is not None else "" logging.info( f"[Val] 
rank{self.global_rank}_batch{batch_idx}_idx{idx}: " - f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' | Pred: '{pred_transcripts[idx][:50]}...'" + f"CER={cer:.4f}, WER={wer:.4f}{utmos_str} | GT: '{gt_transcript[:50]}...' | Pred: '{pred_transcripts[idx][:50]}...'" ) # Save per-audio metrics JSON file alongside the audio file @@ -2307,6 +2364,7 @@ def validation_step(self, batch, batch_idx): 'cer': float(cer), 'wer': float(wer), 'ssim': ssim, + 'utmos': utmos_score, 'gt_transcript': gt_transcript, 'pred_transcript': pred_transcripts[idx], 'audio_path': predicted_audio_paths[idx], @@ -2331,6 +2389,8 @@ def validation_step(self, batch, batch_idx): val_output['val_wer_list'] = batch_wer if batch_ssim: val_output['val_ssim'] = torch.tensor(np.mean(batch_ssim), device=self.device) + if batch_utmos: + val_output['val_utmos'] = torch.tensor(np.mean(batch_utmos), device=self.device) self.validation_step_outputs.append(val_output) @@ -2363,6 +2423,7 @@ def collect_if_exists(key): val_cer = collect_if_exists("val_cer") val_wer = collect_if_exists("val_wer") val_ssim = collect_if_exists("val_ssim") + val_utmos = collect_if_exists("val_utmos") if val_cer is not None: self.log("val/cer", val_cer, prog_bar=True, sync_dist=True) @@ -2370,6 +2431,8 @@ def collect_if_exists(key): self.log("val/wer", val_wer, prog_bar=True, sync_dist=True) if val_ssim is not None: self.log("val/ssim", val_ssim, prog_bar=True, sync_dist=True) + if val_utmos is not None: + self.log("val/utmos", val_utmos, prog_bar=True, sync_dist=True) if self.use_multilingual_asr: lang_cer = {} diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index d5c94fec59b1..45d9bd542b59 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -29,10 +29,11 @@ from 
nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.parts.utils.helpers import ( + compute_utmos_scores_from_filepaths, get_mask_from_lengths, get_speaker_embeddings_from_filepaths, process_text_for_cer, - transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.utils import logging @@ -52,6 +53,13 @@ Normalizer = None PYNINI_AVAILABLE = False +try: + from nemo.collections.tts.modules.utmosv2 import UTMOSv2Calculator + + HAVE_UTMOSV2 = True +except (ImportError, ModuleNotFoundError): + HAVE_UTMOSV2 = False + class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): """ @@ -59,7 +67,7 @@ class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): Training flow: 1. Sample multiple generations per prompt. - 2. Compute rewards (CER/SSIM/PESQ). + 2. Compute rewards (CER/SSIM/PESQ/UTMOSv2). 3. Compute group-normalized advantages. 4. Run teacher-forced policy forward on generated codes and optimize GRPO objective. 5. Add auxiliary phoneme loss from the same forward pass with GT phoneme tokens. @@ -118,6 +126,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): assert HAVE_TORCHAUDIO, "torchaudio is required for PESQ reward." self.squim_objective_model = SQUIM_OBJECTIVE.get_model() + self.use_utmos = self.cfg.get('use_utmos', False) + if self.use_utmos: + assert HAVE_UTMOSV2, ( + "UTMOSv2 is required for the UTMOS reward but is not installed. " + "Install it with: pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + ) + # Initialize on CPU; we score from saved wav files so no GPU needed. 
+ self._utmos_calculator = UTMOSv2Calculator(device='cpu') + logging.info("UTMOSv2 calculator initialized for naturalness reward") + self.loss_type = self.cfg.get('loss_type', 'grpo') if self.loss_type not in ['grpo', 'dr_grpo']: raise ValueError( @@ -151,6 +169,7 @@ def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]: '_speaker_verification_model', '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', '_reference_model', 'whisper_model', 'whisper_processor', 'squim_objective_model', + '_utmos_calculator', } groups: Dict[str, List[torch.nn.Parameter]] = {} for name, module in self.named_children(): @@ -259,6 +278,7 @@ def setup_optimizer_param_groups(self): '_reference_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', # These modules are not used by the PO loss and receive no gradients. # Including them would only apply weight decay, degrading their weights. 'final_proj', @@ -277,7 +297,7 @@ def setup_optimizer_param_groups(self): def state_dict(self, destination=None, prefix='', keep_vars=False): state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - keys_substrings_to_exclude = ['_reference_model'] + keys_substrings_to_exclude = ['_reference_model', '_utmos_calculator'] for key in list(state_dict.keys()): if any(substring in key for substring in keys_substrings_to_exclude): del state_dict[key] @@ -496,18 +516,86 @@ def _print_group_cer_wer_table( f"{item_metrics['cer_gt']:.4f}", f"{item_metrics['wer_gt']:.4f}", f"{item_metrics['spk_similarity']:.4f}", + f"{item_metrics.get('utmos', 0.0):.4f}", f"{item_metrics['reward']:.4f}", f"{item_metrics.get('advantage', 0.0):.4f}", ] ) - table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "reward", "advantage"], rows=rows) + table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows) print( f"[generate_and_reward] group={group_idx} 
valid={is_group_valid} " f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n" f"prompt: {prompt_text}\n{table}\n" ) + def _compute_pred_transcripts(self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str) -> List[str]: + if reward_asr_model == 'nemo': + pred_transcripts = self._eval_asr_model.transcribe( + predicted_audio_paths, + batch_size=len(predicted_audio_paths), + override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), + ) + return [process_text_for_cer(transcript.text) for transcript in pred_transcripts] + + self.whisper_model.to(self.device) + pred_transcripts = [""] * len(predicted_audio_paths) + langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths)) + language_groups = {} + for item_idx, audio_path in enumerate(predicted_audio_paths): + language = langs[item_idx] if item_idx < len(langs) else 'en' + language_groups.setdefault(language, []).append((item_idx, audio_path)) + + for language, grouped_items in language_groups.items(): + normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None + grouped_paths = [audio_path for _, audio_path in grouped_items] + group_transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=grouped_paths, + language=language, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=normalizer, + ) + for (item_idx, _), transcript in zip(grouped_items, group_transcripts): + pred_transcripts[item_idx] = process_text_for_cer(transcript) + return pred_transcripts + + def _compute_speaker_embeddings_parallel( + self, predicted_audio_paths: List[str], batch: Dict, num_generations_per_item: int + ): + reference_audio_paths = self._get_reference_audio_paths(batch) + pred_speaker_embeddings = get_speaker_embeddings_from_filepaths( + predicted_audio_paths, self._eval_speaker_verification_model, self.device + ) + 
gt_speaker_embeddings = get_speaker_embeddings_from_filepaths( + reference_audio_paths, self._eval_speaker_verification_model, self.device + ) + if num_generations_per_item > 1: + gt_speaker_embeddings = gt_speaker_embeddings.repeat_interleave(num_generations_per_item, dim=0) + + if gt_speaker_embeddings.size(0) != pred_speaker_embeddings.size(0): + raise RuntimeError( + f"Speaker embedding size mismatch. GT={gt_speaker_embeddings.size(0)}, " + f"Pred={pred_speaker_embeddings.size(0)}." + ) + return pred_speaker_embeddings, gt_speaker_embeddings + + def _compute_utmos_scores_batched(self, predicted_audio_paths: List[str]) -> List[float]: + if not self.use_utmos: + return [0.0] * len(predicted_audio_paths) + if len(predicted_audio_paths) == 0: + return [] + utmos_batch_size = max(int(self.cfg.get('utmos_batch_size', len(predicted_audio_paths))), 1) + utmos_num_workers = max(int(self.cfg.get('utmos_num_workers', 0)), 0) + return compute_utmos_scores_from_filepaths( + audio_filepaths=predicted_audio_paths, + utmos_calculator=self._utmos_calculator, + batch_size=utmos_batch_size, + num_workers=utmos_num_workers, + rank_tag=str(self.global_rank), + ) + def generate_and_reward( self, batch: Dict, @@ -566,57 +654,31 @@ def generate_and_reward( audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))] rewarding_start_time = time.perf_counter() - if reward_asr_model == 'nemo': - pred_transcripts = self._eval_asr_model.transcribe( - predicted_audio_paths, - batch_size=len(predicted_audio_paths), - override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), - ) - pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts] - else: - self.whisper_model.to(self.device) - pred_transcripts = [] - langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths)) - for item_idx, audio_path in enumerate(predicted_audio_paths): 
- language = langs[item_idx] if item_idx < len(langs) else 'en' - normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None - print(f"Transcribing audio {audio_path} with language {language}") - transcript = transcribe_with_whisper( - audio_filepath=audio_path, - language=language, - whisper_processor=self.whisper_processor, - whisper_model=self.whisper_model, - device=self.device, - normalizer=normalizer, - ) - print(f"Pred Transcript: {transcript}") - print(f"Normalized Pred Text: {process_text_for_cer(transcript)}") - print(f"Raw Text: {batch_repeated['raw_texts'][item_idx]}") - print("--------------------------------") - pred_transcripts.append(process_text_for_cer(transcript)) - - reference_audio_paths = self._get_reference_audio_paths(batch_repeated) + pred_transcripts = self._compute_pred_transcripts(predicted_audio_paths, batch_repeated, reward_asr_model) try: - pred_speaker_embeddings = get_speaker_embeddings_from_filepaths( - predicted_audio_paths, self._eval_speaker_verification_model, self.device - ) - gt_speaker_embeddings = get_speaker_embeddings_from_filepaths( - reference_audio_paths, self._eval_speaker_verification_model, self.device + pred_speaker_embeddings, gt_speaker_embeddings = self._compute_speaker_embeddings_parallel( + predicted_audio_paths, batch, num_generations_per_item ) except Exception as e: logging.warning(f"Speaker-embedding reward failed. Falling back to zero SSIM reward. 
Error: {e}") pred_speaker_embeddings = None gt_speaker_embeddings = None + utmos_scores = self._compute_utmos_scores_batched(predicted_audio_paths) batch_metrics = [] cer_reward_weight = self.cfg.get('cer_reward_weight', 0.5) ssim_reward_weight = self.cfg.get('ssim_reward_weight', 0.5) pesq_reward_weight = self.cfg.get('pesq_reward_weight', 0.0) + utmos_reward_weight = self.cfg.get('utmos_reward_weight', 0.0) min_valid_codes_len = self.cfg.get('min_valid_codes_len', 4) max_valid_codes_len = self.cfg.get( 'max_valid_codes_len', self.max_decoder_steps * self.frame_stacking_factor - 1 ) + # UTMOSv2 reward shaping parameters (MOS scale is 1–5). + mean_utmos_dataset = self.cfg.get('mean_utmos_dataset', 3.5) + best_utmos_achievable = self.cfg.get('best_utmos_achievable', 4.5) + for idx in range(predicted_audio.size(0)): pred_transcript = pred_transcripts[idx] gt_transcript = process_text_for_cer(batch_repeated['raw_texts'][idx]) @@ -641,6 +703,8 @@ def generate_and_reward( else: pesq_hyp = 0.0 + utmos_score = utmos_scores[idx] + item_metrics = { 'cer_gt': float(cer_gt), 'wer_gt': float(wer_gt), @@ -650,6 +714,7 @@ def generate_and_reward( 'gt_transcript': gt_transcript, 'codes_len': int(predicted_codes_lens[idx].item()), 'pesq': float(pesq_hyp), + 'utmos': float(utmos_score), } best_ssim_achievable = self.cfg.get('best_ssim_achievable', 0.9) @@ -671,10 +736,27 @@ def generate_and_reward( spk_similarity_reward = 0.5 - 0.5 * (mean_ssim_dataset - item_ssim) / max(mean_ssim_dataset, 1e-8) pesq_reward = item_metrics['pesq'] / 4.5 if use_pesq else 0.0 + + # UTMOSv2 reward: piecewise linear shaping centered on mean_utmos_dataset, + # analogous to the CER and SSIM reward shaping. 
+ if self.use_utmos: + item_utmos = max(min(utmos_score, best_utmos_achievable), 1.0) + if item_utmos >= mean_utmos_dataset: + utmos_reward = 0.5 + 0.5 * (item_utmos - mean_utmos_dataset) / max( + best_utmos_achievable - mean_utmos_dataset, 1e-8 + ) + else: + utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max( + mean_utmos_dataset - 1.0, 1e-8 + ) + else: + utmos_reward = 0.0 + reward = ( cer_reward * cer_reward_weight + spk_similarity_reward * ssim_reward_weight + pesq_reward * pesq_reward_weight + + utmos_reward * utmos_reward_weight ) if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len): item_metrics['_needs_group_min_reward'] = True @@ -684,6 +766,7 @@ def generate_and_reward( item_metrics['cer_reward'] = float(cer_reward) item_metrics['spk_similarity_reward'] = float(spk_similarity_reward) item_metrics['pesq_reward'] = float(pesq_reward) + item_metrics['utmos_reward'] = float(utmos_reward) item_metrics['reward'] = float(reward) batch_metrics.append(item_metrics) diff --git a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index a6d11f6ac1ae..d754f5718130 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -30,6 +30,7 @@ get_speaker_embeddings_from_filepaths, process_text_for_cer, transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.utils import logging @@ -661,14 +662,25 @@ def generate_and_reward( ) pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts] elif self.cfg.get("reward_asr_model", "nemo") == "whisper": - pred_transcripts = [] + pred_transcripts = [""] * len(predicted_audio_paths) + language_groups = {} for item_idx, audio_path in enumerate(predicted_audio_paths): language = batch_repeated['languages'][item_idx] + 
language_groups.setdefault(language, []).append((item_idx, audio_path)) + + for language, grouped_items in language_groups.items(): normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None - transcript = transcribe_with_whisper( - audio_path, language, self.whisper_processor, self.whisper_model, self.device, normalizer + grouped_paths = [audio_path for _, audio_path in grouped_items] + grouped_transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=grouped_paths, + language=language, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=normalizer, ) - pred_transcripts.append(transcript) + for (item_idx, _), transcript in zip(grouped_items, grouped_transcripts): + pred_transcripts[item_idx] = transcript pred_transcripts = [process_text_for_cer(transcript) for transcript in pred_transcripts] else: # Address CodeQL issue where pred_transcripts might be undefined for future code diff --git a/nemo/collections/tts/modules/utmosv2.py b/nemo/collections/tts/modules/utmosv2.py index fb1dc76d17bd..46b17316d0ea 100644 --- a/nemo/collections/tts/modules/utmosv2.py +++ b/nemo/collections/tts/modules/utmosv2.py @@ -62,21 +62,28 @@ def __call__(self, file_path): mos_score = self.model.predict(input_path=file_path, num_repetitions=1, num_workers=0) return mos_score - def process_directory(self, input_dir: str, batch_size: int = 16) -> list[dict[str, str | float]]: + def process_directory( + self, input_dir: str, batch_size: int = 16, num_workers: int = None + ) -> list[dict[str, str | float]]: """ Computes UTMOSv2 scores for all `*.wav` files in the given directory. Args: input_dir: The directory containing the audio files. - batch_size: The number of audio files to process in parallel. + batch_size: The number of audio files per scoring batch. + num_workers: Number of worker processes used by UTMOS internals. + Set to 0 to avoid multiprocessing pickling issues. 
Returns: A list of dictionaries, each containing the file path and the UTMOSv2 score. """ + if num_workers is None: + num_workers = batch_size + with torch.inference_mode(): # UTMOSV2 tends to launch many of OpenMP threads which overloads the machine's CPUs # while actually slowing down the prediction. Limit the number of threads here. with threadpool_limits(limits=1): results = self.model.predict( - input_dir=input_dir, num_repetitions=1, num_workers=batch_size, batch_size=batch_size + input_dir=input_dir, num_repetitions=1, num_workers=num_workers, batch_size=batch_size ) return results diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index a8ee48ce57ef..cf6dbbdcd494 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -43,8 +43,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import string +import os +import shutil +import tempfile from enum import Enum -from typing import Any, Optional, Tuple +from collections import defaultdict +from typing import Any, List, Optional, Sequence, Tuple, Union import librosa import matplotlib.pylab as plt @@ -845,19 +849,69 @@ def transcribe_with_whisper( """ Transcribe audio with Whisper. Optionally normalize the transcript if a normalizer is provided. 
""" - speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) - forced_decoder_ids = ( - whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None + transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=[audio_filepath], + language=language, + whisper_processor=whisper_processor, + whisper_model=whisper_model, + device=device, + normalizer=normalizer, ) - inputs = whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features - inputs = inputs.to(device) - with torch.no_grad(): - predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) - transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) - result = transcription[0] - if normalizer is not None: - result = normalizer.normalize(result) - return result + return transcripts[0] + + +def transcribe_with_whisper_from_filepaths( + audio_filepaths: Sequence[str], + language: Optional[Union[str, Sequence[Optional[str]]]], + whisper_processor: Any, + whisper_model: Any, + device: torch.device, + normalizer: Optional[Any] = None, + batch_size: Optional[int] = None, +) -> List[str]: + """ + Transcribe a list of audios with Whisper using batched inference. + Supports a single language for all files or per-file language values. + """ + if len(audio_filepaths) == 0: + return [] + + if batch_size is None: + batch_size = len(audio_filepaths) + if batch_size <= 0: + raise ValueError(f"batch_size must be > 0, but received: {batch_size}") + + if isinstance(language, str) or language is None: + languages = [language] * len(audio_filepaths) + else: + if len(language) != len(audio_filepaths): + raise ValueError( + f"Expected len(language) == len(audio_filepaths), but got {len(language)} and {len(audio_filepaths)}." 
+ ) + languages = list(language) + + grouped_indices = defaultdict(list) + for idx, lang in enumerate(languages): + grouped_indices[lang].append(idx) + + transcripts = [""] * len(audio_filepaths) + for lang, indices in grouped_indices.items(): + forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + for start_idx in range(0, len(indices), batch_size): + batch_indices = indices[start_idx : start_idx + batch_size] + speech_arrays = [librosa.load(audio_filepaths[idx], sr=16000)[0] for idx in batch_indices] + inputs = whisper_processor( + speech_arrays, sampling_rate=16000, return_tensors="pt", padding=True + ).input_features.to(device) + with torch.no_grad(): + predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) + batch_transcripts = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) + if normalizer is not None: + batch_transcripts = [normalizer.normalize(text) for text in batch_transcripts] + for idx, text in zip(batch_indices, batch_transcripts): + transcripts[idx] = text + + return transcripts def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): @@ -883,3 +937,59 @@ def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, ) return speaker_embeddings + + +def compute_utmos_scores_from_filepaths( + audio_filepaths: Sequence[str], + utmos_calculator: Any, + batch_size: int = 8, + num_workers: int = 0, + rank_tag: str = "0", +) -> List[float]: + """ + Compute UTMOS scores in strict batched mode for a list of wav filepaths. 
+ + Expected UTMOS batch output schema (per item): + {'file_path': <str>, 'predicted_mos': <float>} + """ + if len(audio_filepaths) == 0: + return [] + + batch_size = max(int(batch_size), 1) + num_workers = max(int(num_workers), 0) + scores = [0.0] * len(audio_filepaths) + + with tempfile.TemporaryDirectory(prefix=f"utmos_rank{rank_tag}_") as tmp_dir: + file_to_idx = {} + for idx, src_path in enumerate(audio_filepaths): + tmp_name = f"{idx:06d}.wav" + tmp_path = os.path.join(tmp_dir, tmp_name) + try: + os.symlink(src_path, tmp_path) + except OSError: + try: + os.link(src_path, tmp_path) + except OSError: + shutil.copy2(src_path, tmp_path) + file_to_idx[tmp_name] = idx + + batch_results = utmos_calculator.process_directory(tmp_dir, batch_size=batch_size, num_workers=num_workers) + if not isinstance(batch_results, list): + raise RuntimeError(f"Unexpected UTMOSv2 output type: {type(batch_results)}") + + for item in batch_results: + if not isinstance(item, dict): + raise RuntimeError(f"Unexpected UTMOSv2 batch item type: {type(item)}") + if 'file_path' not in item or 'predicted_mos' not in item: + raise RuntimeError( + "Unexpected UTMOSv2 batch item schema. Expected keys: 'file_path' and 'predicted_mos'. " + f"Got keys: {list(item.keys())}" + ) + idx = file_to_idx.get(os.path.basename(str(item['file_path']))) + if idx is None: + raise RuntimeError( + f"UTMOSv2 returned unknown file path '{item['file_path']}' that does not map to this batch." 
+ ) + scores[idx] = float(item['predicted_mos']) + + return scores From acc05a10f89c82f0870b232a0ca3455c1133af71 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 24 Feb 2026 14:43:22 -0500 Subject: [PATCH 67/94] full phoneme channel dropout option Signed-off-by: Paarth Neekhara --- examples/tts/conf/magpietts/easy_magpietts.yaml | 1 + examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml | 1 + nemo/collections/tts/models/easy_magpietts.py | 5 +++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 3a9a274b624c..a668686dc28c 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -104,6 +104,7 @@ model: phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 phoneme_corruption_unk_mode_prob: 0.5 + phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel" phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 459c7cd071df..6eb4d03a98d2 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -99,6 +99,7 @@ model: phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 phoneme_corruption_unk_mode_prob: 0.5 + phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel" phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 680a313618e6..bb6d8c208f1f 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -385,6 +385,7 
@@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) + self.phoneme_corruption_type = cfg.get('phoneme_corruption_type', 'repeat_skip_unk') self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) @@ -1847,8 +1848,8 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. - apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) - dropout_complete_phoneme_channel = dropout_conditional_input + apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) and self.phoneme_corruption_type == 'repeat_skip_unk' + dropout_complete_phoneme_channel = mode == 'train' and ( dropout_conditional_input or (self.phoneme_corruption_type == 'complete_channel' and torch.rand(1).item() < self.phoneme_corruption_batch_prob)) ( phoneme_channel_embedding, phoneme_channel_lens, From 2c4520b41850e2f725a3d9ceaa78532e22f05d12 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 27 Feb 2026 01:12:54 -0800 Subject: [PATCH 68/94] gt phoneme option in do_tts Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index bb6d8c208f1f..f7b254ec7977 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -3670,9 +3670,11 @@ def do_tts( temperature: float = 0.7, topk: int = 80, max_steps: int = 330, + gt_phoneme_text: Optional[str] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """ Generate speech from transcript using EasyMagpie inference with optional context text/audio. + Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. """ if transcript is None or transcript.strip() == "": raise ValueError("`transcript` must be a non-empty string.") @@ -3728,6 +3730,19 @@ def do_tts( 'context_audio_codes': context_audio_codes, 'context_audio_codes_lens': context_audio_codes_lens, } + phoneme_input_type = 'pred' + if gt_phoneme_text is not None: + if self.phoneme_tokenizer is None: + raise ValueError("Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided.") + gt_phoneme_text = gt_phoneme_text.strip() + if gt_phoneme_text == "": + raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") + gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + if len(gt_phoneme_tokens) == 0: + raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") + batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) + batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) + phoneme_input_type = 'gt' with torch.inference_mode(): output = self.infer_batch( @@ -3738,7 +3753,7 @@ def do_tts( use_cfg=use_cfg, cfg_scale=cfg_scale, use_local_transformer_for_inference=use_local_transformer, - phoneme_input_type='pred', + phoneme_input_type=phoneme_input_type, phoneme_sampling_method='argmax', use_teacher_forced=False, use_inference_mode=True, From 6501ada7c5d2f7eeb59b050f92578ae551dbed00 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 27 Feb 2026 11:53:08 -0800 Subject: [PATCH 69/94] bug fix Signed-off-by: Shehzeen Hussain --- 
nemo/collections/tts/models/easy_magpietts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index f7b254ec7977..54305e088663 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3738,6 +3738,7 @@ def do_tts( if gt_phoneme_text == "": raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + gt_phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] if len(gt_phoneme_tokens) == 0: raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) From 6d635aa5f2eecb1932874ae1ad287e7714282840 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 1 Mar 2026 19:52:49 -0800 Subject: [PATCH 70/94] ignore phoneme channel for some languages Signed-off-by: Paarth Neekhara --- nemo/collections/tts/data/text_to_speech_dataset.py | 8 +++++++- .../collections/tts/data/text_to_speech_dataset_lhotse.py | 5 +++++ nemo/collections/tts/models/easy_magpietts.py | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index f680a8d9eb34..65671b8606ed 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -379,6 +379,7 @@ def __init__( context_duration_max: float = 10.0, text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, + ignore_phoneme_languages: List[str] = None, ): super().__init__( dataset_meta=dataset_meta, @@ -412,6 +413,7 @@ def __init__( self.context_duration_max = context_duration_max self.text_context_remapping = text_context_remapping 
self.text_context_remapping_prob = text_context_remapping_prob + self.ignore_phoneme_languages = ignore_phoneme_languages or [] def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -430,6 +432,7 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) if data.tokenizer_names is not None: # Pick a random tokenizer from the list of tokenizers tokenizer_name = random.choice(data.tokenizer_names) + language = data.manifest_entry.get('language', 'en') tokens = self.text_tokenizer.encode(text=data.text, tokenizer_name=tokenizer_name) tokens = tokens + [self.eos_id] # Not adding BOS id tokens = torch.tensor(tokens, dtype=torch.int32) @@ -450,6 +453,9 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) f"Text: {data.text}" ) phoneme_text = data.manifest_entry['ipa'] + if language in self.ignore_phoneme_languages: + # Ignore phoneme tokenization for this language. 
+ phoneme_text = "" else: phoneme_text = data.text phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) @@ -628,7 +634,7 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) else: example['raw_text'] = data.text - example['language'] = data.manifest_entry.get('language', 'en') + example['language'] = language if "reward" in data.manifest_entry: example["reward"] = data.manifest_entry["reward"] diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index ffd6b5629cc4..cb478c87fe7f 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -151,6 +151,7 @@ def __init__( text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, phoneme_tokenizer_config: DictConfig = None, + ignore_phoneme_languages: List[str] = None, ): super().__init__() self.sample_rate = sample_rate @@ -175,6 +176,7 @@ def __init__( self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob self.phoneme_tokenizer_config = phoneme_tokenizer_config + self.ignore_phoneme_languages = ignore_phoneme_languages or [] def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -436,6 +438,9 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) f"Cut ID: {cut.id}, Text: {text_str}" ) phoneme_text = cut.supervisions[0].ipa + if language in self.ignore_phoneme_languages: + # Ignore phoneme tokenization for this language + phoneme_text = "" else: phoneme_text = text_str phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 54305e088663..e0f2a87da55a 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2477,6 +2477,7 @@ def get_dataset(self, dataset_cfg, dataset_type): pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, context_duration_min=self.cfg.context_duration_min, context_duration_max=self.cfg.context_duration_max, + ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), ) dataset.load_16khz_audio = False dataset.tokenizer_config = ( @@ -2506,6 +2507,7 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, tokenizer_config=self.cfg.text_tokenizers, phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), + ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), ) data_loader = get_lhotse_dataloader_from_config( From df4027704b6c6e0505024de3e33ba417885b1aa8 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 2 Mar 2026 18:31:49 -0800 Subject: [PATCH 71/94] PO updates, cross lingual dataset creation Signed-off-by: Shehzeen Hussain --- .../easy_magpietts_preference_optimization.py | 32 +- .../create_crosslingual_context_dataset.py | 939 ++++++++++++++++++ .../magpietts/inspect_crosslingual_dataset.py | 151 +++ 3 files changed, 1109 insertions(+), 13 deletions(-) create mode 100644 scripts/magpietts/create_crosslingual_context_dataset.py create mode 100644 scripts/magpietts/inspect_crosslingual_dataset.py diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 45d9bd542b59..1643474dc5ce 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -145,6 +145,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220) 
self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0) self.po_groups_per_subbatch = max(int(self.cfg.get('po_groups_per_subbatch', 1)), 1) + self.batch_size_for_chunked_tf = self.cfg.get('batch_size_for_chunked_tf', 4) self._normalize_whisper_transcript = self.cfg.get('normalize_whisper_transcript', True) if reward_asr_model == 'whisper' and self._normalize_whisper_transcript: @@ -1049,8 +1050,14 @@ def _run_teacher_forced_chunked_po( n_generations_per_item: int, do_backward: bool, ): - num_groups = len(batch_repeated['raw_texts']) // n_generations_per_item - groups_per_subbatch = max(self.po_groups_per_subbatch, 1) + total_items = len(batch_repeated['raw_texts']) + if self.batch_size_for_chunked_tf is not None: + chunk_size = self.batch_size_for_chunked_tf + else: + # Backward compatibility: preserve previous effective item-chunk size + # when the new explicit batch-size chunking config is not set. + chunk_size = max(self.po_groups_per_subbatch, 1) * max(n_generations_per_item, 1) + chunk_size = max(int(chunk_size), 1) accumulated_loss = torch.tensor(0.0, device=self.device) accumulated_po_loss = torch.tensor(0.0, device=self.device) @@ -1059,10 +1066,9 @@ def _run_teacher_forced_chunked_po( accumulated_entropy = torch.tensor(0.0, device=self.device) used_gt_phoneme_input = 0.0 - for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch): - item_start_idx = group_start_idx * n_generations_per_item - item_end_idx = group_end_idx * n_generations_per_item - group_weight = float(group_end_idx - group_start_idx) / max(float(num_groups), 1.0) + for item_start_idx in range(0, total_items, chunk_size): + item_end_idx = min(item_start_idx + chunk_size, total_items) + chunk_weight = float(item_end_idx - item_start_idx) / max(float(total_items), 1.0) batch_sub = self._slice_batch_range(batch_repeated, item_start_idx, item_end_idx) predicted_codes_sub = predicted_codes[item_start_idx:item_end_idx] @@ -1102,15 +1108,15 
@@ def _run_teacher_forced_chunked_po( ) if do_backward: - self.manual_backward(chunk_outputs['loss'] * group_weight) + self.manual_backward(chunk_outputs['loss'] * chunk_weight) - accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * group_weight - accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * group_weight + accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * chunk_weight + accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * chunk_weight accumulated_phoneme_aux_loss = ( - accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight + accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * chunk_weight ) - accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight - accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * group_weight + accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * chunk_weight + accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * chunk_weight used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input']) return { @@ -1151,7 +1157,7 @@ def training_step(self, batch, batch_idx): teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time # Clip gradients to prevent catastrophic updates from outlier batches. 
- max_grad_norm = self.cfg.get('max_grad_norm', 1.0) + max_grad_norm = self.cfg.get('max_grad_norm', 0.0) if max_grad_norm > 0: torch.nn.utils.clip_grad_norm_( [p for p in self.parameters() if p.requires_grad and p.grad is not None], diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py new file mode 100644 index 000000000000..2b488eb4097c --- /dev/null +++ b/scripts/magpietts/create_crosslingual_context_dataset.py @@ -0,0 +1,939 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Creates a cross-lingual context dataset for TTS training. + +For each target utterance in language A, finds the closest speaker voice from a +different language B (using TitaNet speaker embeddings) and pairs the target with +context audio from that cross-lingual speaker. + +The script operates in three stages: + Stage 1: Build a per-speaker TitaNet embedding index across all languages. + Stage 2: Compute cross-lingual speaker matches and sample a language-balanced subset. + Stage 3: Extract audio to disk and write a NeMo-format JSONL manifest. + +After running this script, use create_lhotse_shar_from_nemo_manifest.py to convert +the output manifest into lhotse shar format, then optionally run +extend_lhotse_shards_with_audio_codes.py to add codec codes. 
+ +Example usage: + python scripts/magpietts/create_crosslingual_context_dataset.py \ + --master-yaml /data/magpie_pretraining_data/manifests/ipa_manifests/train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml \ + --output-dir /data/crosslingual_context_dataset \ + --target-hours 50.0 \ + --samples-per-speaker 5 \ + --seed 42 \ + --log-level INFO +""" + +import argparse +import glob as glob_module +import gzip +import json +import logging +import os +import pickle +import random +import re +from collections import defaultdict +from typing import Any, Dict, List, Tuple + +import numpy as np +import soundfile as sf +import torch +import yaml +from lhotse import CutSet +from tqdm import tqdm + +TITANET_MODEL_NAME = "nvidia/speakerverification_en_titanet_large" +TITANET_SAMPLE_RATE = 16000 + + +# --------------------------------------------------------------------------- +# YAML / shar helpers +# --------------------------------------------------------------------------- + +def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: + """ + Parse the master multilingual YAML and each per-language YAML it references. + Returns {language: [list of shar_entry dicts with context_audio]}. 
+ """ + yaml_base_dir = os.path.dirname(yaml_path) + with open(yaml_path, 'r') as f: + master_entries = yaml.safe_load(f) + + lang_to_shar_entries: Dict[str, List[Dict]] = defaultdict(list) + for entry in master_entries: + lang = entry.get("tags", {}).get("lang") + child_yaml_path = entry.get("input_cfg") + if not lang or not child_yaml_path: + continue + if not os.path.isabs(child_yaml_path): + child_yaml_path = os.path.join(yaml_base_dir, child_yaml_path) + if not os.path.isfile(child_yaml_path): + logging.warning(f"Per-language YAML not found: {child_yaml_path}") + continue + with open(child_yaml_path, 'r') as f: + child_entries = yaml.safe_load(f) + for ce in child_entries: + shar_path = ce.get("shar_path", {}) + if "context_audio" not in shar_path: + logging.debug(f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}") + continue + lang_to_shar_entries[lang].append(ce) + + return dict(lang_to_shar_entries) + + +def expand_shar_range(pattern: str) -> List[str]: + """ + Expand a shar path pattern like '.../cuts.{000000..001231}.jsonl.gz' + into a list of concrete file paths. + """ + match = re.search(r'\{(\d+)\.\.(\d+)\}', pattern) + if not match: + return [pattern] + start_idx = int(match.group(1)) + end_idx = int(match.group(2)) + width = len(match.group(1)) + prefix = pattern[:match.start()] + suffix = pattern[match.end():] + return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] + + +def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: + """Extract (language, dataset, speaker_id) from '| Language:XX Dataset:YYY Speaker:ZZZ |'.""" + lang_m = re.search(r"Language:(\w+)", speaker_str) + dataset_m = re.search(r"Dataset:([\w\d\W]+?) Speaker:", speaker_str) + spk_m = re.search(r"Speaker:([\w\d\W]+?) 
\|", speaker_str) + lang = lang_m.group(1) if lang_m else "unknown" + dataset = dataset_m.group(1).strip() if dataset_m else "unknown" + speaker_id = spk_m.group(1).strip() if spk_m else "unknown" + return lang, dataset, speaker_id + + +# --------------------------------------------------------------------------- +# Stage 1: Build speaker embedding index +# --------------------------------------------------------------------------- + +def discover_speakers_from_cuts( + lang_to_shar_entries: Dict[str, List[Dict]], + max_cuts_per_speaker: int, + max_shards_per_dataset: int = 0, +) -> Dict[str, Dict]: + """ + Pass 1 (metadata only): Read cut JSONL files to discover unique speakers + and collect up to max_cuts_per_speaker cut metadata entries per speaker. + + Args: + max_shards_per_dataset: If > 0, only scan this many .jsonl.gz shard + files per shar group (dataset) instead of all shards. This + dramatically speeds up discovery for large datasets while still + finding most speakers. + + Returns: {speaker_str: {"language": str, "cut_metas": [list of (shar_entry, shard_idx, cut_json_dict)]}} + """ + speaker_info: Dict[str, Dict] = {} + + for lang, shar_entries in lang_to_shar_entries.items(): + logging.info(f"[Stage 1] Discovering speakers for language: {lang} ({len(shar_entries)} shar groups)") + for se in shar_entries: + cuts_pattern = se["shar_path"]["cuts"] + cuts_files = expand_shar_range(cuts_pattern) + if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: + logging.info( + f" Limiting scan to {max_shards_per_dataset}/{len(cuts_files)} " + f"shards for dataset: {cuts_pattern}" + ) + cuts_files = cuts_files[:max_shards_per_dataset] + for cuts_file in cuts_files: + if not os.path.isfile(cuts_file): + continue + shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) + shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 + try: + with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: + for line in f: + cut_json = 
json.loads(line) + supervisions = cut_json.get("supervisions", []) + if not supervisions: + continue + speaker_str = supervisions[0].get("speaker", "") + if not speaker_str: + continue + if speaker_str not in speaker_info: + speaker_info[speaker_str] = { + "language": lang, + "cut_metas": [], + } + if len(speaker_info[speaker_str]["cut_metas"]) < max_cuts_per_speaker: + speaker_info[speaker_str]["cut_metas"].append((se, shard_idx, cut_json)) + except Exception as e: + logging.warning(f"Error reading {cuts_file}: {e}") + + logging.info(f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages") + for lang in sorted(lang_to_shar_entries.keys()): + n = sum(1 for v in speaker_info.values() if v["language"] == lang) + logging.info(f" {lang}: {n} speakers") + return speaker_info + + +def compute_speaker_embeddings( + speaker_info: Dict[str, Dict], + sv_model: torch.nn.Module, + device: torch.device, + batch_size: int = 16, +) -> Dict[str, Dict]: + """ + Pass 2: For each speaker, load audio from shar tars for the sampled cuts, + compute TitaNet embeddings, and average them into a single representative vector. 
+ + Returns: {speaker_str: {"language": str, "embedding": np.ndarray}} + """ + speaker_embeddings: Dict[str, Dict] = {} + + speakers_needing_audio = {} + for spk, info in speaker_info.items(): + cut_metas = info["cut_metas"] + if not cut_metas: + continue + grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) + for (se, shard_idx, cut_json) in cut_metas: + shar_key = json.dumps(se["shar_path"], sort_keys=True) + grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) + speakers_needing_audio[spk] = { + "language": info["language"], + "grouped": grouped_by_shar_and_shard, + } + + # Collect audio in batches: load from shar, accumulate waveforms per speaker + speaker_audio_tensors: Dict[str, List[torch.Tensor]] = defaultdict(list) + + logging.info(f"[Stage 1] Loading audio for {len(speakers_needing_audio)} speakers to compute embeddings...") + + # Group all (shar_entry, shard_idx) that we need to load + shar_shard_to_speakers: Dict[Tuple[str, int], List[Tuple[str, str]]] = defaultdict(list) + for spk, data in speakers_needing_audio.items(): + for shar_key, shard_map in data["grouped"].items(): + for shard_idx, items in shard_map.items(): + for (se, cut_json) in items: + cut_id = cut_json.get("id", "") + shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) + + # Process shard by shard to minimize tar file openings + total_shards = len(shar_shard_to_speakers) + for (shar_key, shard_idx), spk_cut_pairs in tqdm( + shar_shard_to_speakers.items(), desc="[Stage 1] Loading audio shards", total=total_shards + ): + se_shar_path = json.loads(shar_key) + cuts_files = expand_shar_range(se_shar_path["cuts"]) + target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) + + if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): + logging.warning(f"Shard index {shard_idx} out of range, skipping") + continue + + cut_file = cuts_files[shard_idx] + target_tar = 
target_audio_files[shard_idx] + + if not os.path.isfile(cut_file) or not os.path.isfile(target_tar): + logging.warning(f"Missing shard files: cuts={cut_file}, target={target_tar}") + continue + + needed_cut_ids = {cut_id for (_, cut_id) in spk_cut_pairs} + cut_id_to_spk = {cut_id: spk for (spk, cut_id) in spk_cut_pairs} + + try: + fields = { + "cuts": [cut_file], + "recording": [target_tar], + } + # Also include context_recording if available, to avoid errors + context_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) + if shard_idx < len(context_audio_files) and os.path.isfile(context_audio_files[shard_idx]): + fields["context_recording"] = [context_audio_files[shard_idx]] + + shard_cutset = CutSet.from_shar(fields=fields) + for cut in shard_cutset: + if cut.id in needed_cut_ids: + spk = cut_id_to_spk[cut.id] + audio_np = cut.recording.resample(TITANET_SAMPLE_RATE).load_audio().squeeze(0) + audio_tensor = torch.from_numpy(audio_np).float() + speaker_audio_tensors[spk].append(audio_tensor) + needed_cut_ids.discard(cut.id) + if not needed_cut_ids: + break + except Exception as e: + logging.warning(f"Error loading shard {cut_file}: {e}") + + # Now compute embeddings in batches + logging.info(f"[Stage 1] Computing TitaNet embeddings for {len(speaker_audio_tensors)} speakers...") + all_speakers = list(speaker_audio_tensors.keys()) + + for batch_start in tqdm(range(0, len(all_speakers), batch_size), desc="[Stage 1] TitaNet batches"): + batch_speakers = all_speakers[batch_start : batch_start + batch_size] + audio_list = [] + audio_lens = [] + spk_indices = [] # maps each audio in batch back to speaker + + for spk in batch_speakers: + for audio_t in speaker_audio_tensors[spk]: + audio_list.append(audio_t.to(device)) + audio_lens.append(audio_t.size(0)) + spk_indices.append(spk) + + if not audio_list: + continue + + batch_lens = torch.tensor(audio_lens, device=device).long() + max_len = int(batch_lens.max().item()) + padded = 
torch.zeros(len(audio_list), max_len, device=device, dtype=torch.float32) + for i, t in enumerate(audio_list): + padded[i, : t.size(0)] = t + + with torch.inference_mode(): + _, embeddings = sv_model.forward(input_signal=padded, input_signal_length=batch_lens) + + embeddings_np = embeddings.cpu().float().numpy() + + # Average embeddings per speaker + spk_emb_accum: Dict[str, List[np.ndarray]] = defaultdict(list) + for i, spk in enumerate(spk_indices): + spk_emb_accum[spk].append(embeddings_np[i]) + + for spk in batch_speakers: + if spk in spk_emb_accum and spk_emb_accum[spk]: + avg_emb = np.mean(spk_emb_accum[spk], axis=0) + avg_emb = avg_emb / (np.linalg.norm(avg_emb) + 1e-8) + speaker_embeddings[spk] = { + "language": speakers_needing_audio[spk]["language"], + "embedding": avg_emb, + } + + logging.info(f"[Stage 1] Computed embeddings for {len(speaker_embeddings)} speakers") + return speaker_embeddings + + +def run_stage1( + lang_to_shar_entries: Dict[str, List[Dict]], + samples_per_speaker: int, + device: torch.device, + index_path: str, + batch_size: int = 16, + max_shards_per_dataset: int = 0, +) -> Dict[str, Dict]: + """Run full Stage 1: discover speakers, load audio, compute embeddings, save index.""" + if os.path.isfile(index_path): + logging.info(f"[Stage 1] Loading cached speaker index from {index_path}") + with open(index_path, 'rb') as f: + return pickle.load(f) + + from nemo.collections.asr.models import EncDecSpeakerLabelModel + + logging.info(f"[Stage 1] Loading TitaNet model: {TITANET_MODEL_NAME}") + sv_model = EncDecSpeakerLabelModel.from_pretrained(TITANET_MODEL_NAME) + sv_model = sv_model.to(device) + sv_model.eval() + + speaker_info = discover_speakers_from_cuts( + lang_to_shar_entries, + max_cuts_per_speaker=samples_per_speaker, + max_shards_per_dataset=max_shards_per_dataset, + ) + speaker_embeddings = compute_speaker_embeddings(speaker_info, sv_model, device, batch_size=batch_size) + + os.makedirs(os.path.dirname(index_path), exist_ok=True) + 
with open(index_path, 'wb') as f: + pickle.dump(speaker_embeddings, f) + logging.info(f"[Stage 1] Saved speaker index to {index_path}") + + del sv_model + torch.cuda.empty_cache() + return speaker_embeddings + + +# --------------------------------------------------------------------------- +# Stage 2: Cross-lingual speaker matching + language-balanced sampling +# --------------------------------------------------------------------------- + +def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: + """ + For each speaker S in language L, find the closest speaker S' from a different + language by cosine similarity of their TitaNet embeddings. + + Returns: {speaker_str: (best_match_speaker_str, cosine_similarity)} + """ + speakers = list(speaker_embeddings.keys()) + n = len(speakers) + logging.info(f"[Stage 2] Building cross-lingual map for {n} speakers...") + + # Build embedding matrix + emb_matrix = np.stack([speaker_embeddings[s]["embedding"] for s in speakers]) + langs = [speaker_embeddings[s]["language"] for s in speakers] + + # Cosine similarity matrix (embeddings are already L2-normalized) + sim_matrix = emb_matrix @ emb_matrix.T + + cross_lingual_map: Dict[str, Tuple[str, float]] = {} + for i in range(n): + best_j = -1 + best_sim = -2.0 + for j in range(n): + if langs[j] == langs[i]: + continue + if sim_matrix[i, j] > best_sim: + best_sim = sim_matrix[i, j] + best_j = j + if best_j >= 0: + cross_lingual_map[speakers[i]] = (speakers[best_j], float(best_sim)) + else: + logging.warning(f"No cross-lingual match found for speaker: {speakers[i]}") + + logging.info(f"[Stage 2] Built cross-lingual map with {len(cross_lingual_map)} entries") + avg_sim = np.mean([v[1] for v in cross_lingual_map.values()]) if cross_lingual_map else 0 + logging.info(f"[Stage 2] Average cross-lingual similarity: {avg_sim:.4f}") + return cross_lingual_map + + +def sample_balanced_cuts( + lang_to_shar_entries: Dict[str, List[Dict]], + 
def sample_balanced_cuts(
    lang_to_shar_entries: Dict[str, List[Dict]],
    cross_lingual_map: Dict[str, Tuple[str, float]],
    target_hours: float,
    seed: int,
    max_shards_per_dataset: int = 0,
) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
    """
    Sample cuts across languages so each language contributes approximately
    target_hours / num_languages hours of target audio.

    Args:
        max_shards_per_dataset: If > 0, only read this many shard files per
            dataset. Since only a few hours per language are needed, reading a
            small fraction of shards is sufficient and avoids scanning tens of
            thousands of files for large datasets.

    Returns:
        target_cuts_by_lang: {lang: [list of cut_json dicts with extra metadata]}
        context_pool_by_speaker: {speaker_str: [list of (shar_entry, shard_idx, cut_json)]}
    """
    rng = random.Random(seed)
    num_langs = len(lang_to_shar_entries)
    hours_per_lang = target_hours / num_langs
    secs_per_lang = hours_per_lang * 3600
    # Over-collect by 3x so the shuffle below has enough material to pick from.
    collect_secs_per_lang = secs_per_lang * 3

    logging.info(f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)")

    # Speakers that appear on the *match* side of any cross-lingual pair;
    # their cuts feed the context pool.
    matched_speakers = set(v[0] for v in cross_lingual_map.values())

    target_cuts_by_lang: Dict[str, List[Dict]] = {}
    context_pool_by_speaker: Dict[str, List] = defaultdict(list)

    for lang, shar_entries in lang_to_shar_entries.items():
        logging.info(f"[Stage 2] Reading cuts for language: {lang}")
        candidates: List[Dict] = []
        collected_secs = 0.0
        budget_reached = False

        for shar_entry in shar_entries:
            if budget_reached:
                break
            shard_files = expand_shar_range(shar_entry["shar_path"]["cuts"])
            if max_shards_per_dataset > 0 and len(shard_files) > max_shards_per_dataset:
                shard_files = shard_files[:max_shards_per_dataset]
                logging.info(
                    f"  Limiting to {max_shards_per_dataset} shards for dataset: "
                    f"{shar_entry['shar_path']['cuts']}"
                )
            for shard_file in shard_files:
                if budget_reached:
                    break
                if not os.path.isfile(shard_file):
                    continue
                idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", shard_file)
                shard_idx = int(idx_match.group(1)) if idx_match else 0
                try:
                    with gzip.open(shard_file, 'rt', encoding='utf-8') as fh:
                        for raw_line in fh:
                            cut_json = json.loads(raw_line)
                            speaker_str = cut_json.get("supervisions", [{}])[0].get("speaker", "")
                            if not speaker_str:
                                continue
                            if speaker_str in matched_speakers:
                                context_pool_by_speaker[speaker_str].append((shar_entry, shard_idx, cut_json))
                            if speaker_str in cross_lingual_map:
                                # Stash provenance on the cut so Stage 3 can
                                # find its shard again later.
                                cut_json["_shar_entry"] = shar_entry
                                cut_json["_shard_idx"] = shard_idx
                                cut_json["_speaker_str"] = speaker_str
                                candidates.append(cut_json)
                                collected_secs += cut_json.get("duration", 0)
                                if collected_secs >= collect_secs_per_lang:
                                    budget_reached = True
                                    break
                except Exception as e:
                    logging.warning(f"Error reading {shard_file}: {e}")

        logging.info(f"  {lang}: {len(candidates)} candidate target cuts ({collected_secs / 3600:.2f}h collected)")

        # Shuffle, then keep taking cuts until the per-language duration
        # budget is met.
        rng.shuffle(candidates)
        picked: List[Dict] = []
        picked_secs = 0.0
        for cut_json in candidates:
            dur = cut_json.get("duration", 0)
            if dur <= 0:
                continue
            picked.append(cut_json)
            picked_secs += dur
            if picked_secs >= secs_per_lang:
                break

        target_cuts_by_lang[lang] = picked
        logging.info(f"  {lang}: sampled {len(picked)} cuts, {picked_secs / 3600:.2f}h")

    total_sampled = sum(len(v) for v in target_cuts_by_lang.values())
    total_hours = sum(sum(c.get("duration", 0) for c in v) for v in target_cuts_by_lang.values()) / 3600
    logging.info(f"[Stage 2] Total sampled: {total_sampled} cuts, {total_hours:.2f}h")
    return target_cuts_by_lang, dict(context_pool_by_speaker)
def run_stage3(
    target_cuts_by_lang: Dict[str, List[Dict]],
    context_pool_by_speaker: Dict[str, List],
    cross_lingual_map: Dict[str, Tuple[str, float]],
    speaker_embeddings: Dict[str, Dict],
    output_dir: str,
    sample_rate: int,
    seed: int,
):
    """
    For each sampled target cut, pick a context utterance from the matched
    cross-lingual speaker, extract both audios to disk, and write the manifest.

    Args:
        target_cuts_by_lang: Output of sample_balanced_cuts (cuts carry
            ``_shar_entry`` / ``_shard_idx`` / ``_speaker_str`` metadata).
        context_pool_by_speaker: Candidate context cuts per matched speaker.
        cross_lingual_map: {speaker: (best cross-lingual speaker, similarity)}.
        speaker_embeddings: Speaker index from Stage 1 (unused here; kept for
            interface stability).
        output_dir: Base directory for extracted audio and the manifest.
        sample_rate: Sample rate for saved wav files.
        seed: Seed for the context-cut choice RNG.

    Returns:
        Path to the written NeMo-style JSONL manifest.
    """
    rng = random.Random(seed)
    audio_dir = os.path.join(output_dir, "extracted_audio")
    target_audio_dir = os.path.join(audio_dir, "target")
    context_audio_dir = os.path.join(audio_dir, "context")
    os.makedirs(target_audio_dir, exist_ok=True)
    os.makedirs(context_audio_dir, exist_ok=True)

    manifest_path = os.path.join(output_dir, "manifest.json")

    # Assign a context cut (from the matched cross-lingual speaker) to each
    # target cut.
    assignments: List[Dict] = []
    for lang, cuts in target_cuts_by_lang.items():
        for cut_json in cuts:
            spk = cut_json["_speaker_str"]
            matched_spk, ssim = cross_lingual_map[spk]
            ctx_pool = context_pool_by_speaker.get(matched_spk, [])
            if not ctx_pool:
                logging.warning(f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}")
                continue
            ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool)
            assignments.append({
                "target_cut_json": cut_json,
                "target_shar_entry": cut_json["_shar_entry"],
                "target_shard_idx": cut_json["_shard_idx"],
                "target_speaker": spk,
                "context_cut_json": ctx_cut_json,
                "context_shar_entry": ctx_se,
                "context_shard_idx": ctx_shard_idx,
                "context_speaker": matched_spk,
                "ssim": ssim,
                "lang": lang,
            })

    logging.info(f"[Stage 3] Total assignments: {len(assignments)}")

    # Group assignment indices by (shar_key, shard_idx) so each shard tar is
    # opened only once per audio kind.
    target_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list)
    context_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list)

    for idx, a in enumerate(assignments):
        t_shar_key = json.dumps(a["target_shar_entry"]["shar_path"], sort_keys=True)
        target_loads[(t_shar_key, a["target_shard_idx"])].append(idx)
        c_shar_key = json.dumps(a["context_shar_entry"]["shar_path"], sort_keys=True)
        context_loads[(c_shar_key, a["context_shard_idx"])].append(idx)

    # Arrays to hold extracted audio file paths (None => extraction failed)
    target_audio_paths = [None] * len(assignments)
    context_audio_paths = [None] * len(assignments)

    def _save_audio_from_shard(
        shard_loads: Dict[Tuple[str, int], List[int]],
        assignments_list: List[Dict],
        cut_json_key: str,
        out_subdir: str,
        out_paths_array: List,
        audio_field: str,
    ):
        """Load cuts from shar tars and save individual audio files to disk."""
        total_shards = len(shard_loads)
        for (shar_key_str, shard_idx), indices in tqdm(
            shard_loads.items(), desc=f"[Stage 3] Extracting {audio_field}", total=total_shards
        ):
            se_shar_path = json.loads(shar_key_str)
            cuts_files = expand_shar_range(se_shar_path["cuts"])
            target_audio_files = expand_shar_range(se_shar_path.get("target_audio", ""))

            if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files):
                logging.warning(f"Shard {shard_idx} out of range, skipping")
                continue

            cut_file = cuts_files[shard_idx]
            tar_file = target_audio_files[shard_idx]

            if not os.path.isfile(cut_file) or not os.path.isfile(tar_file):
                logging.warning(f"Missing files: {cut_file} or {tar_file}")
                continue

            # BUGFIX: multiple assignments can reference the SAME cut id
            # (rng.choice may hand one context cut to several targets). The
            # previous cid -> single-index mapping kept only the last
            # assignment, leaving the others without an audio path so they
            # were silently dropped from the manifest. Map each cut id to
            # ALL assignment indices that need it.
            needed_cut_ids: Dict[str, List[int]] = defaultdict(list)
            for i in indices:
                cj = assignments_list[i][cut_json_key]
                needed_cut_ids[cj.get("id", "")].append(i)

            try:
                fields = {"cuts": [cut_file], "recording": [tar_file]}
                ctx_audio_files = expand_shar_range(se_shar_path.get("context_audio", ""))
                if ctx_audio_files and shard_idx < len(ctx_audio_files) and os.path.isfile(ctx_audio_files[shard_idx]):
                    fields["context_recording"] = [ctx_audio_files[shard_idx]]

                shard_cutset = CutSet.from_shar(fields=fields)
                for cut in shard_cutset:
                    if cut.id in needed_cut_ids:
                        audio_np = cut.recording.resample(sample_rate).load_audio().squeeze(0)
                        safe_id = cut.id.replace("/", "_")
                        out_file = os.path.join(out_subdir, f"{safe_id}.wav")
                        sf.write(out_file, audio_np, sample_rate)
                        # Manifest paths are relative to extracted_audio/
                        rel_path = os.path.relpath(out_file, audio_dir)
                        for assign_idx in needed_cut_ids.pop(cut.id):
                            out_paths_array[assign_idx] = rel_path
                        if not needed_cut_ids:
                            break
            except Exception as e:
                logging.warning(f"Error processing shard {cut_file}: {e}")

    # Extract target audio
    logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...")
    _save_audio_from_shard(
        target_loads, assignments, "target_cut_json",
        target_audio_dir, target_audio_paths, "target_audio",
    )

    # Extract context audio
    logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...")
    _save_audio_from_shard(
        context_loads, assignments, "context_cut_json",
        context_audio_dir, context_audio_paths, "context_audio",
    )

    # Write manifest
    logging.info(f"[Stage 3] Writing manifest to {manifest_path}")
    written = 0
    skipped = 0
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for idx, a in enumerate(assignments):
            # Skip pairs where either extraction failed.
            if target_audio_paths[idx] is None or context_audio_paths[idx] is None:
                skipped += 1
                continue

            t_cut = a["target_cut_json"]
            c_cut = a["context_cut_json"]
            t_sup = t_cut.get("supervisions", [{}])[0]

            text = t_sup.get("text", "")
            normalized_text = t_sup.get("custom", {}).get("normalized_text", text)
            ipa = t_sup.get("custom", {}).get("ipa", "")
            speaker = t_sup.get("speaker", "")
            duration = t_cut.get("duration", 0)
            context_duration = c_cut.get("duration", 0)
            ctx_lang_parsed, _, _ = parse_speaker_field(a["context_speaker"])
            target_lang_parsed, _, _ = parse_speaker_field(speaker)

            entry = {
                "audio_filepath": target_audio_paths[idx],
                "text": text,
                "normalized_text": normalized_text,
                "speaker": speaker,
                "language": target_lang_parsed,
                "duration": duration,
                "context_audio_filepath": context_audio_paths[idx],
                "context_audio_duration": context_duration,
                "context_speaker_similarity": round(a["ssim"], 6),
                "context_language": ctx_lang_parsed,
                "context_speaker": a["context_speaker"],
            }
            if ipa:
                entry["ipa"] = ipa

            # Carry over any additional custom fields from the target
            # supervision, except bookkeeping keys that would be stale here.
            _exclude_custom_keys = {
                "target_audio_codes_path", "context_audio_codes_path",
                "context_audio_text", "context_audio_normalized_text",
                "context_audio_offset"
            }
            for k, v in t_sup.get("custom", {}).items():
                if k not in entry and k not in _exclude_custom_keys:
                    entry[k] = v

            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
            written += 1

    logging.info(f"[Stage 3] Manifest written: {written} entries, {skipped} skipped")
    return manifest_path
+ """ + cuts_dir = os.path.join(lhotse_shar_dir, "cuts") + target_audio_dir = os.path.join(lhotse_shar_dir, "target_audio") + context_audio_dir = os.path.join(lhotse_shar_dir, "context_audio") + + cuts_files = sorted(glob_module.glob(os.path.join(cuts_dir, "cuts.*.jsonl.gz"))) + context_files = sorted(glob_module.glob(os.path.join(context_audio_dir, "recording.*.tar"))) + + if not cuts_files: + logging.error(f"No cut files found in {cuts_dir}") + return + + # Determine shard range + first_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) + last_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[-1]).group(1)) + width = len(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) + + def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: + path = os.path.join(directory, f"{prefix}.{{{first_idx:0{width}d}..{last_idx:0{width}d}}}.{ext}") + return path + + shar_path = { + "cuts": _make_range_pattern(cuts_dir, "cuts", "jsonl.gz"), + "target_audio": _make_range_pattern(target_audio_dir, "recording", "tar"), + } + if context_files: + shar_path["context_audio"] = _make_range_pattern(context_audio_dir, "recording", "tar") + + # Check for codec codes + for codec_dir_name in os.listdir(lhotse_shar_dir): + codec_subdir = os.path.join(lhotse_shar_dir, codec_dir_name) + if not os.path.isdir(codec_subdir): + continue + target_codes_dir = os.path.join(codec_subdir, "target_codes") + context_codes_dir = os.path.join(codec_subdir, "context_codes") + if os.path.isdir(target_codes_dir): + tc_files = sorted(glob_module.glob(os.path.join(target_codes_dir, "codes.*.tar"))) + if tc_files: + tc_first = int(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) + tc_last = int(re.search(r"codes\.(\d+)\.tar$", tc_files[-1]).group(1)) + tc_width = len(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) + shar_path["target_codes"] = os.path.join( + target_codes_dir, f"codes.{{{tc_first:0{tc_width}d}..{tc_last:0{tc_width}d}}}.tar" + ) + 
if os.path.isdir(context_codes_dir): + cc_files = sorted(glob_module.glob(os.path.join(context_codes_dir, "codes.*.tar"))) + if cc_files: + cc_first = int(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) + cc_last = int(re.search(r"codes\.(\d+)\.tar$", cc_files[-1]).group(1)) + cc_width = len(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) + shar_path["context_codes"] = os.path.join( + context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" + ) + + yaml_entry = [{ + "type": "lhotse_shar", + "shar_path": shar_path, + "weight": 1.0, + "tags": { + "task": "tts", + "lang": "crosslingual", + "tokenizer_names": ["nemotron_nano_30b"], + }, + }] + + os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) + with open(output_yaml_path, 'w') as f: + yaml.dump(yaml_entry, f, default_flow_style=False, sort_keys=False) + logging.info(f"YAML config written to {output_yaml_path}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", + ) + parser.add_argument( + "--master-yaml", required=True, type=str, + help="Path to the master multilingual YAML (e.g. 
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for this script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.",
    )
    parser.add_argument(
        "--master-yaml", required=True, type=str,
        help="Path to the master multilingual YAML (e.g. train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).",
    )
    parser.add_argument(
        "--output-dir", required=True, type=str,
        help="Base directory for all outputs (extracted audio, manifest, speaker index).",
    )
    parser.add_argument(
        "--target-hours", type=float, default=50.0,
        help="Total hours of target audio to sample (split equally across languages).",
    )
    parser.add_argument(
        "--samples-per-speaker", type=int, default=5,
        help="Number of utterances per speaker to use for computing the average TitaNet embedding.",
    )
    parser.add_argument(
        "--sample-rate", type=int, default=24000,
        help="Sample rate for saving extracted audio files.",
    )
    parser.add_argument(
        "--embedding-batch-size", type=int, default=16,
        help="Batch size for TitaNet embedding computation.",
    )
    parser.add_argument(
        "--max-shards-per-dataset", type=int, default=0,
        help="Max number of .jsonl.gz shard files to scan per dataset during "
        "speaker discovery (Stage 1). 0 means scan all shards. "
        "Setting this to e.g. 10 dramatically speeds up discovery while "
        "still finding most speakers.",
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="Random seed for reproducibility.",
    )
    parser.add_argument(
        "--log-level", type=str, default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logging level.",
    )
    parser.add_argument(
        "--generate-yaml", type=str, default=None,
        help="If provided, skip stages 1-3 and instead generate a YAML config "
        "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. "
        "Value is the output YAML file path.",
    )
    return parser


def main():
    """Entry point: run stages 1-3 (or YAML generation) end to end."""
    args = _build_arg_parser().parse_args()

    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    # Seed every RNG we might touch for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    # --- Generate YAML config mode (post Stage 4) ---
    if args.generate_yaml:
        lhotse_shar_dir = os.path.join(args.output_dir, "lhotse_shar")
        generate_yaml_config(lhotse_shar_dir, args.generate_yaml)
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device: {device}")

    # --- Parse master YAML ---
    logging.info(f"Parsing master YAML: {args.master_yaml}")
    lang_to_shar_entries = parse_master_yaml(args.master_yaml)
    if not lang_to_shar_entries:
        logging.error("No shar entries found. Check the master YAML path and contents.")
        return

    for lang, entries in sorted(lang_to_shar_entries.items()):
        logging.info(f"  Language '{lang}': {len(entries)} shar groups (with context_audio)")

    # --- Stage 1: Build speaker embedding index ---
    index_path = os.path.join(args.output_dir, "speaker_embedding_index.pkl")
    speaker_embeddings = run_stage1(
        lang_to_shar_entries,
        samples_per_speaker=args.samples_per_speaker,
        device=device,
        index_path=index_path,
        batch_size=args.embedding_batch_size,
        max_shards_per_dataset=args.max_shards_per_dataset,
    )

    # --- Stage 2: Cross-lingual matching + balanced sampling ---
    cross_lingual_map = build_crosslingual_map(speaker_embeddings)
    target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts(
        lang_to_shar_entries, cross_lingual_map,
        target_hours=args.target_hours, seed=args.seed,
        max_shards_per_dataset=args.max_shards_per_dataset,
    )

    # --- Stage 3: Extract audio + write manifest ---
    manifest_path = run_stage3(
        target_cuts_by_lang, context_pool_by_speaker, cross_lingual_map,
        speaker_embeddings, args.output_dir, args.sample_rate, args.seed,
    )

    # --- Summary ---
    logging.info("=" * 60)
    logging.info("Cross-lingual context dataset creation complete!")
    logging.info(f"  Manifest: {manifest_path}")
    logging.info(f"  Audio dir: {os.path.join(args.output_dir, 'extracted_audio')}")
    logging.info("")
    logging.info("Next steps:")
    logging.info("  1. Convert to lhotse shar format:")
    logging.info(f"     python scripts/magpietts/create_lhotse_shar_from_nemo_manifest.py \\")
    logging.info(f"       --manifest-path {manifest_path} \\")
    logging.info(f"       --audio-base-dir {os.path.join(args.output_dir, 'extracted_audio')} \\")
    logging.info(f"       --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\")
    logging.info(f"       --num-jobs 16 --processing-chunk-size 256 --audio-format flac --shuffle --shuffle-seed 42")
    logging.info("")
    logging.info("  2. (Optional) Add codec codes:")
    logging.info(f"     python scripts/magpietts/extend_lhotse_shards_with_audio_codes.py \\")
    logging.info(f"       --cuts-dir {os.path.join(args.output_dir, 'lhotse_shar', 'cuts')} \\")
    logging.info(f"       --target-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'target_audio')} \\")
    logging.info(f"       --context-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'context_audio')} \\")
    logging.info(f"       --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\")
    logging.info(f"       --codec-model-path ")
    logging.info("")
    yaml_out = os.path.join(args.output_dir, "crosslingual_context.yaml")
    logging.info("  3. Generate YAML config for training:")
    logging.info(f"     python scripts/magpietts/create_crosslingual_context_dataset.py \\")
    logging.info(f"       --master-yaml {args.master_yaml} \\")
    logging.info(f"       --output-dir {args.output_dir} \\")
    logging.info(f"       --generate-yaml {yaml_out}")
    logging.info("=" * 60)
def main():
    """Inspect a cross-lingual shar dataset: for each sample, save the original
    target/context recordings plus the waveforms re-decoded from the stored
    codec codes, along with a small metadata file, for manual A/B listening.
    """
    parser = argparse.ArgumentParser(description="Inspect cross-lingual dataset: decode codes and save audio.")
    parser.add_argument("--shar-dir", required=True, help="Path to lhotse_shar directory.")
    parser.add_argument("--codec-model-path", required=True, help="Path to .nemo codec model.")
    parser.add_argument("--codec-name", default="25fpsSpectralCodecBWE", help="Codec subdirectory name.")
    parser.add_argument("--output-dir", required=True, help="Directory to save inspection outputs.")
    parser.add_argument("--num-samples", type=int, default=10, help="Number of samples to inspect.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    os.makedirs(args.output_dir, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load codec model
    logging.info(f"Loading codec model from {args.codec_model_path}")
    codec_model = AudioCodecModel.restore_from(args.codec_model_path, map_location="cpu", strict=False)
    codec_model = codec_model.to(device)
    codec_model.eval()
    codec_sr = codec_model.sample_rate
    logging.info(f"Codec output sample rate: {codec_sr}")

    # Build shar fields for first shard
    cuts_dir = os.path.join(args.shar_dir, "cuts")
    target_audio_dir = os.path.join(args.shar_dir, "target_audio")
    context_audio_dir = os.path.join(args.shar_dir, "context_audio")
    target_codes_dir = os.path.join(args.shar_dir, args.codec_name, "target_codes")
    context_codes_dir = os.path.join(args.shar_dir, args.codec_name, "context_codes")

    # Use first shard only
    fields = {
        "cuts": [os.path.join(cuts_dir, "cuts.000000.jsonl.gz")],
        "recording": [os.path.join(target_audio_dir, "recording.000000.tar")],
        "context_recording": [os.path.join(context_audio_dir, "recording.000000.tar")],
        "target_codes": [os.path.join(target_codes_dir, "codes.000000.tar")],
        "context_codes": [os.path.join(context_codes_dir, "codes.000000.tar")],
    }

    for k, v in fields.items():
        if not os.path.isfile(v[0]):
            logging.error(f"Missing file for '{k}': {v[0]}")
            return

    logging.info("Loading CutSet from shar...")
    cutset = CutSet.from_shar(fields=fields)

    count = 0
    for cut in cutset:
        if count >= args.num_samples:
            break

        sup = cut.supervisions[0] if cut.supervisions else None
        lang = sup.language if sup else "unk"
        speaker = sup.speaker if sup else "unk"
        # BUGFIX: SupervisionSegment.custom defaults to None, so
        # hasattr(sup, "custom") is True while sup.custom.get(...) raised
        # AttributeError. Guard the None value, not just the attribute.
        sup_custom = (sup.custom or {}) if sup and hasattr(sup, "custom") else {}
        ctx_lang = sup_custom.get("context_language", "unk")
        ssim = sup_custom.get("context_speaker_similarity", "N/A")

        sample_dir = os.path.join(args.output_dir, f"sample_{count:03d}_{lang}")
        os.makedirs(sample_dir, exist_ok=True)

        logging.info(f"--- Sample {count} ---")
        logging.info(f"  Cut ID: {cut.id}")
        logging.info(f"  Target lang: {lang}, Context lang: {ctx_lang}, SSIM: {ssim}")
        logging.info(f"  Speaker: {speaker}")
        if sup:
            logging.info(f"  Text: {sup.text[:80]}...")

        # 1. Save original target recording audio
        target_audio_np = cut.recording.resample(codec_sr).load_audio().squeeze(0)
        sf.write(os.path.join(sample_dir, "target_recording.wav"), target_audio_np, codec_sr)
        logging.info(f"  Saved target_recording.wav ({len(target_audio_np)/codec_sr:.2f}s)")

        # 2. Save original context recording audio
        if cut.has_custom("context_recording"):
            ctx_audio_np = cut.context_recording.resample(codec_sr).load_audio().squeeze(0)
            sf.write(os.path.join(sample_dir, "context_recording.wav"), ctx_audio_np, codec_sr)
            logging.info(f"  Saved context_recording.wav ({len(ctx_audio_np)/codec_sr:.2f}s)")

        # 3. Decode target codes -> audio
        if cut.has_custom("target_codes"):
            target_codes_np = cut.target_codes.load().astype(np.int32)  # (C, T)
            target_codes_t = torch.from_numpy(target_codes_np).unsqueeze(0).to(device)  # (1, C, T)
            target_codes_len = torch.tensor([target_codes_t.shape[2]], device=device)
            with torch.inference_mode():
                decoded_target, decoded_target_len = codec_model.decode(
                    tokens=target_codes_t, tokens_len=target_codes_len
                )
            decoded_target_np = decoded_target[0, :decoded_target_len[0]].cpu().float().numpy()
            sf.write(os.path.join(sample_dir, "target_decoded_from_codes.wav"), decoded_target_np, codec_model.output_sample_rate)
            logging.info(f"  Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}")
        else:
            logging.warning(f"  No target_codes found for cut {cut.id}")

        # 4. Decode context codes -> audio
        if cut.has_custom("context_codes"):
            ctx_codes_np = cut.context_codes.load().astype(np.int32)  # (C, T)
            ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device)  # (1, C, T)
            ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device)
            with torch.inference_mode():
                decoded_ctx, decoded_ctx_len = codec_model.decode(
                    tokens=ctx_codes_t, tokens_len=ctx_codes_len
                )
            decoded_ctx_np = decoded_ctx[0, :decoded_ctx_len[0]].cpu().float().numpy()
            sf.write(os.path.join(sample_dir, "context_decoded_from_codes.wav"), decoded_ctx_np, codec_model.output_sample_rate)
            logging.info(f"  Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}")
        else:
            logging.warning(f"  No context_codes found for cut {cut.id}")

        # 5. Write metadata
        with open(os.path.join(sample_dir, "info.txt"), "w") as f:
            f.write(f"cut_id: {cut.id}\n")
            f.write(f"target_language: {lang}\n")
            f.write(f"context_language: {ctx_lang}\n")
            f.write(f"speaker: {speaker}\n")
            f.write(f"context_speaker_similarity: {ssim}\n")
            f.write(f"text: {sup.text if sup else ''}\n")
            f.write(f"duration: {cut.duration}\n")

        count += 1

    logging.info(f"Done. Saved {count} samples to {args.output_dir}")


if __name__ == "__main__":
    main()
Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/data/text_to_speech_dataset.py | 8 +++++++- .../tts/data/text_to_speech_dataset_lhotse.py | 9 ++++++++- nemo/collections/tts/models/easy_magpietts.py | 3 +++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 65671b8606ed..4d99c463d18b 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -380,6 +380,7 @@ def __init__( text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, ignore_phoneme_languages: List[str] = None, + add_language_to_context_text: bool = False, ): super().__init__( dataset_meta=dataset_meta, @@ -414,6 +415,7 @@ def __init__( self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob self.ignore_phoneme_languages = ignore_phoneme_languages or [] + self.add_language_to_context_text = add_language_to_context_text def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -602,7 +604,11 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) context_tokens = self.text_tokenizer.encode(context_text, self.text_conditioning_tokenizer_name) example['has_text_context'] = True else: - context_tokens = self.text_tokenizer.encode("[NO TEXT CONTEXT]", self.text_conditioning_tokenizer_name) + if self.add_language_to_context_text: + context_text = f"[{language.upper()}]" + else: + context_text = "[NO TEXT CONTEXT]" + context_tokens = self.text_tokenizer.encode(context_text, self.text_conditioning_tokenizer_name) example['has_text_context'] = False if self.pad_context_text_to_max_duration: _required_len = ( diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py 
b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index cb478c87fe7f..356cc8ca4d15 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -152,6 +152,7 @@ def __init__( text_context_remapping_prob: float = 0.0, phoneme_tokenizer_config: DictConfig = None, ignore_phoneme_languages: List[str] = None, + add_language_to_context_text: bool = False, ): super().__init__() self.sample_rate = sample_rate @@ -177,6 +178,7 @@ def __init__( self.text_context_remapping_prob = text_context_remapping_prob self.phoneme_tokenizer_config = phoneme_tokenizer_config self.ignore_phoneme_languages = ignore_phoneme_languages or [] + self.add_language_to_context_text = add_language_to_context_text def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -388,8 +390,13 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) ) has_text_context = True else: + if self.add_language_to_context_text: + context_text = f"[{language.upper()}]" + else: + context_text = "[NO TEXT CONTEXT]" + context_text_tokens = self.text_tokenizer.encode( - "[NO TEXT CONTEXT]", tokenizer_name=self.text_conditioning_tokenizer_name + context_text, tokenizer_name=self.text_conditioning_tokenizer_name ) has_text_context = False if self.pad_context_text_to_max_duration: diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e0f2a87da55a..e69e877ed94c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -402,6 +402,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) self.pad_context_text_to_max_duration = False + self.add_language_to_context_text = 
cfg.get('add_language_to_context_text', False) super().__init__(cfg=cfg, trainer=trainer) @@ -2475,6 +2476,7 @@ def get_dataset(self, dataset_cfg, dataset_type): use_text_conditioning_tokenizer=True, text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + add_language_to_context_text=self.add_language_to_context_text, context_duration_min=self.cfg.context_duration_min, context_duration_max=self.cfg.context_duration_max, ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), @@ -2508,6 +2510,7 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D tokenizer_config=self.cfg.text_tokenizers, phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), + add_language_to_context_text=self.add_language_to_context_text, ) data_loader = get_lhotse_dataloader_from_config( From 19ff0eae460568bb7b840db141bbef3867270e95 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 9 Mar 2026 16:32:58 +0000 Subject: [PATCH 73/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_inference.py | 6 +- nemo/collections/tts/models/easy_magpietts.py | 88 ++++++--- .../easy_magpietts_preference_optimization.py | 110 ++++++----- .../modules/magpietts_inference/inference.py | 2 + nemo/collections/tts/modules/utmosv2.py | 2 +- nemo/collections/tts/parts/utils/helpers.py | 9 +- .../create_crosslingual_context_dataset.py | 174 ++++++++++++------ .../magpietts/inspect_crosslingual_dataset.py | 28 ++- .../tts/test_infer_vs_process_batch.py | 20 +- 9 files changed, 285 insertions(+), 154 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 19085f78eb96..d38c093eb1de 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -507,7 +507,11 @@ def 
create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) target_group.add_argument('--is_decoder_only_model', action='store_true') - target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') + target_group.add_argument( + '--legacy_context_stacking', + action='store_true', + help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking', + ) target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e69e877ed94c..e8bb877dfb53 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -752,7 +752,7 @@ def codes_to_audio(self, codes, codes_len): codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) # Updates all lens less than 4 to 4 codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:,:,:codes_len.max()] + codes = codes[:, :, : codes_len.max()] audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) @@ -1291,9 +1291,13 @@ def prepare_context_tensors( ) # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - stack_eos_id = self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - + stack_bos_id = ( + self.audio_bos_id if getattr(self, 'legacy_context_stacking', 
False) else self.context_audio_bos_id + ) + stack_eos_id = ( + self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + ) + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, @@ -1532,7 +1536,9 @@ def corrupt_stacked_phoneme_tokens( source_index = torch.arange(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) step_delta = torch.ones(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) op_is_repeat = torch.rand(corrupt_steps.numel(), device=phoneme_tokens_stacked.device) < 0.5 - step_delta[corrupt_steps] = torch.where(op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2)) + step_delta[corrupt_steps] = torch.where( + op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2) + ) source_index = torch.cumsum(step_delta, dim=0) - step_delta[0] source_index = torch.clamp(source_index, min=0, max=min_len - 1) source_index[0] = 0 @@ -1849,8 +1855,19 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. 
- apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) and self.phoneme_corruption_type == 'repeat_skip_unk' - dropout_complete_phoneme_channel = mode == 'train' and ( dropout_conditional_input or (self.phoneme_corruption_type == 'complete_channel' and torch.rand(1).item() < self.phoneme_corruption_batch_prob)) + apply_phoneme_corruption = ( + mode == 'train' + and (not dropout_text_input) + and (not dropout_conditional_input) + and self.phoneme_corruption_type == 'repeat_skip_unk' + ) + dropout_complete_phoneme_channel = mode == 'train' and ( + dropout_conditional_input + or ( + self.phoneme_corruption_type == 'complete_channel' + and torch.rand(1).item() < self.phoneme_corruption_batch_prob + ) + ) ( phoneme_channel_embedding, phoneme_channel_lens, @@ -1999,7 +2016,9 @@ def process_batch( pb_phoneme_tokens_target = phoneme_tokens_stacked_clean[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if (phoneme_corruption_mode != 'repeat_skip') and not (dropout_complete_phoneme_channel or dropout_conditional_input or dropout_text_input): + if (phoneme_corruption_mode != 'repeat_skip') and not ( + dropout_complete_phoneme_channel or dropout_conditional_input or dropout_text_input + ): phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) @@ -2197,7 +2216,7 @@ def validation_step(self, batch, batch_idx): topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, use_cfg=self.cfg.get('inference_use_cfg_in_val', True), - cfg_scale=2.5 + cfg_scale=2.5, ) # Get audio output directory @@ -2248,11 +2267,7 @@ def validation_step(self, batch, batch_idx): # Save context audio for SSIM computation ctx_audio_np = ( - context_audio_cleaned[idx] - .float() - .detach() - .cpu() - .numpy()[: context_audio_lens_cleaned[idx]] + context_audio_cleaned[idx].float().detach().cpu().numpy()[: 
context_audio_lens_cleaned[idx]] ) ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) @@ -2278,7 +2293,9 @@ def validation_step(self, batch, batch_idx): ) pred_transcripts = [process_text_for_cer(transcript) for transcript in transcripts] except Exception as e: - logging.warning(f"Val batched ASR transcription failed, falling back to per-file mode: {e}") + logging.warning( + f"Val batched ASR transcription failed, falling back to per-file mode: {e}" + ) pred_transcripts = [] for item_idx, audio_path in enumerate(predicted_audio_paths): lang = languages[item_idx] if item_idx < len(languages) else 'en' @@ -2346,7 +2363,9 @@ def validation_step(self, batch, batch_idx): if pred_embeddings is not None and ctx_embeddings is not None: pred_emb = pred_embeddings[idx].cpu().float().numpy() ctx_emb = ctx_embeddings[idx].cpu().float().numpy() - ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) + ssim = float( + np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + ) batch_ssim.append(ssim) # UTMOSv2 naturalness score (MOS on 1-5 scale) @@ -2932,7 +2951,9 @@ def streaming_step( # ==================== DETERMINE PHASES PER BATCH ITEM ==================== needs_context = state.context_position < state.full_context_lens # (B,) bool needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_phoneme = ( + (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + ) needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) @@ -2974,7 +2995,7 @@ def streaming_step( # The EOS token itself IS embedded normally 
(matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. - is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token @@ -3023,12 +3044,11 @@ def streaming_step( ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask - + # Only end phoneme stream in prediction mode when the phoneme EOS is detected state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected next_input = next_input + phoneme_emb - # --- Audio embedding for audio phase items --- if needs_audio.any(): @@ -3041,7 +3061,9 @@ def streaming_step( positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) gt_emb = state.gt_audio_embeddings[ torch.arange(batch_size, device=device), positions, : - ].unsqueeze(1) # (B, 1, E) + ].unsqueeze( + 1 + ) # (B, 1, E) audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() audio_emb = audio_emb + gt_emb * audio_mask else: @@ -3145,7 +3167,7 @@ def streaming_step( ).any( dim=1 ) # (B,) - + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended @@ -3203,7 +3225,7 @@ def streaming_step( torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) - audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio + audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio state.finished = state.finished | audio_eos_detected # Track audio prediction end index (in frames) for items that just ended @@ -3238,7 +3260,9 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> 
torch.Tensor: # Get phoneme logits all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - phoneme_logits = all_code_logits_t_phoneme.view(actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size) + phoneme_logits = all_code_logits_t_phoneme.view( + actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size + ) max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) # Sample phonemes @@ -3256,7 +3280,9 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: and hasattr(self.phoneme_tokenizer, 'unk_token_id') and self.phoneme_confidence_unk_threshold > 0.0 ): - underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(dim=1, keepdim=True) # (B, 1) + underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any( + dim=1, keepdim=True + ) # (B, 1) eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) replace_with_unk = underconfident_step & (~eos_predicted_step) if replace_with_unk.any(): @@ -3622,7 +3648,9 @@ def infer_batch( rtf_metrics=rtf_metrics, predicted_phoneme_tokens=ib_phoneme_tokens, predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, - phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, + phoneme_prediction_start_idx=( + state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None + ), ) @staticmethod @@ -3738,12 +3766,16 @@ def do_tts( phoneme_input_type = 'pred' if gt_phoneme_text is not None: if self.phoneme_tokenizer is None: - raise ValueError("Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided.") + raise ValueError( + "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
+ ) gt_phoneme_text = gt_phoneme_text.strip() if gt_phoneme_text == "": raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) - gt_phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + gt_phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) if len(gt_phoneme_tokens) == 0: raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 1643474dc5ce..020e7af77aa5 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -75,8 +75,8 @@ class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): super().__init__(cfg, trainer) - - self.run_val_inference = True # Always run validation inference in PO. + + self.run_val_inference = True # Always run validation inference in PO. self.automatic_optimization = False ref_model_cfg = copy.deepcopy(cfg) @@ -138,9 +138,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.loss_type = self.cfg.get('loss_type', 'grpo') if self.loss_type not in ['grpo', 'dr_grpo']: - raise ValueError( - f"Received loss_type={self.loss_type}. Supported values: ['grpo', 'dr_grpo']." - ) + raise ValueError(f"Received loss_type={self.loss_type}. 
Supported values: ['grpo', 'dr_grpo'].") self.scale_rewards = self.cfg.get('scale_rewards', True) self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220) self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0) @@ -159,17 +157,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0) self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0) - - if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"): self.decoder.float() def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]: """Return a dict mapping module-group name → list of trainable parameters.""" modules_to_exclude = { - '_speaker_verification_model', '_codec_model', '_eval_asr_model', - '_eval_speaker_verification_model', '_reference_model', - 'whisper_model', 'whisper_processor', 'squim_objective_model', + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + '_reference_model', + 'whisper_model', + 'whisper_processor', + 'squim_objective_model', '_utmos_calculator', } groups: Dict[str, List[torch.nn.Parameter]] = {} @@ -195,21 +196,21 @@ def _compute_grad_and_weight_metrics(self) -> Dict[str, float]: if p.grad is not None: grad_norms.append(p.grad.data.norm(2).item()) - module_weight_norm = float(np.sqrt(sum(w ** 2 for w in weight_norms))) + module_weight_norm = float(np.sqrt(sum(w**2 for w in weight_norms))) metrics[f'weight_norm/{group_name}'] = module_weight_norm all_weight_norms.extend(weight_norms) if grad_norms: - module_grad_norm = float(np.sqrt(sum(g ** 2 for g in grad_norms))) + module_grad_norm = float(np.sqrt(sum(g**2 for g in grad_norms))) metrics[f'grad_norm/{group_name}'] = module_grad_norm all_grad_norms.extend(grad_norms) else: metrics[f'grad_norm/{group_name}'] = 0.0 if all_grad_norms: - metrics['grad_norm/global'] = float(np.sqrt(sum(g ** 2 for g in all_grad_norms))) + 
metrics['grad_norm/global'] = float(np.sqrt(sum(g**2 for g in all_grad_norms))) if all_weight_norms: - metrics['weight_norm/global'] = float(np.sqrt(sum(w ** 2 for w in all_weight_norms))) + metrics['weight_norm/global'] = float(np.sqrt(sum(w**2 for w in all_weight_norms))) return metrics @torch.no_grad() @@ -225,10 +226,10 @@ def _compute_weight_update_metrics(self, prev_weights: Dict[int, torch.Tensor]) if pid in prev_weights: deltas.append((p.data - prev_weights[pid]).norm(2).item()) if deltas: - metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d ** 2 for d in deltas))) + metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d**2 for d in deltas))) all_deltas.extend(deltas) if all_deltas: - metrics['weight_delta/global'] = float(np.sqrt(sum(d ** 2 for d in all_deltas))) + metrics['weight_delta/global'] = float(np.sqrt(sum(d**2 for d in all_deltas))) return metrics @torch.no_grad() @@ -245,14 +246,15 @@ def _print_grad_weight_summary(self, metrics: Dict[str, float], step: int) -> No if not getattr(self.trainer, "is_global_zero", True): return - lines = [f"\n[grad/weight] step={step} " - f"grad={metrics.get('grad_norm/global', 0.0):.6f} " - f"w={metrics.get('weight_norm/global', 0.0):.4f} " - f"Δw={metrics.get('weight_delta/global', 0.0):.8f}"] + lines = [ + f"\n[grad/weight] step={step} " + f"grad={metrics.get('grad_norm/global', 0.0):.6f} " + f"w={metrics.get('weight_norm/global', 0.0):.4f} " + f"Δw={metrics.get('weight_delta/global', 0.0):.8f}" + ] module_names = sorted( - k.split('/')[1] for k in metrics - if k.startswith('weight_norm/') and k != 'weight_norm/global' + k.split('/')[1] for k in metrics if k.startswith('weight_norm/') and k != 'weight_norm/global' ) for name in module_names: gn = metrics.get(f'grad_norm/{name}', 0.0) @@ -317,7 +319,9 @@ def _get_cached_normalizer(self, lang_key: Optional[str]): self._normalizer_cache[lang_key] = None return self._normalizer_cache[lang_key] - def _get_per_token_logps(self, logits: torch.Tensor, 
labels: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor: + def _get_per_token_logps( + self, logits: torch.Tensor, labels: torch.Tensor, loss_mask: torch.Tensor + ) -> torch.Tensor: # Force fp32 for log_softmax to avoid bf16 precision issues that sever the # gradient path through the GRPO "exp(logps - logps.detach())" trick. # Under bf16 autocast, the tiny gradient signal through this identity-like @@ -328,7 +332,6 @@ def _get_per_token_logps(self, logits: torch.Tensor, labels: torch.Tensor, loss_ per_token_logps = per_token_logps * loss_mask.float() return per_token_logps - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """ Override parent to force fp32 computation for the entire local transformer logits path. @@ -428,7 +431,7 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: context_codes[short_context_mask] = target_codes[short_context_mask] context_lens[short_context_mask] = target_lens[short_context_mask] # Slice to the actual max length needed - context_codes = context_codes[..., :context_lens.max()] + context_codes = context_codes[..., : context_lens.max()] if self._codec_converter is not None: context_codes = self._codec_converter.convert_original_to_new( @@ -523,19 +526,25 @@ def _print_group_cer_wer_table( ] ) - table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows) + table = self._format_text_table( + headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows + ) print( f"[generate_and_reward] group={group_idx} valid={is_group_valid} " f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n" f"prompt: {prompt_text}\n{table}\n" ) - def _compute_pred_transcripts(self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str) -> List[str]: + def _compute_pred_transcripts( + self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str + ) -> 
List[str]: if reward_asr_model == 'nemo': pred_transcripts = self._eval_asr_model.transcribe( predicted_audio_paths, batch_size=len(predicted_audio_paths), - override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), + override_config=TranscribeConfig( + use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0 + ), ) return [process_text_for_cer(transcript.text) for transcript in pred_transcripts] @@ -652,7 +661,9 @@ def generate_and_reward( sample_rate=self.output_sample_rate, ) audio_save_time_sec = time.perf_counter() - save_start_time - audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))] + audio_durations = [ + int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0)) + ] rewarding_start_time = time.perf_counter() pred_transcripts = self._compute_pred_transcripts(predicted_audio_paths, batch_repeated, reward_asr_model) @@ -747,9 +758,7 @@ def generate_and_reward( best_utmos_achievable - mean_utmos_dataset, 1e-8 ) else: - utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max( - mean_utmos_dataset - 1.0, 1e-8 - ) + utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max(mean_utmos_dataset - 1.0, 1e-8) else: utmos_reward = 0.0 @@ -759,7 +768,9 @@ def generate_and_reward( + pesq_reward * pesq_reward_weight + utmos_reward * utmos_reward_weight ) - if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len): + if (item_metrics['codes_len'] >= max_valid_codes_len) or ( + item_metrics['codes_len'] <= min_valid_codes_len + ): item_metrics['_needs_group_min_reward'] = True else: item_metrics['_needs_group_min_reward'] = False @@ -843,10 +854,12 @@ def generate_and_reward( } def process_batch_online_po(self, batch: Dict, n_generations_per_item: int, mode: str = 'train'): - generated_codes_and_metrics, batch_repeated, 
predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( - batch=batch, - n_generations_per_item=n_generations_per_item, - mode=mode, + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = ( + self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode=mode, + ) ) chunked_outputs = self._run_teacher_forced_chunked_po( generated_codes_and_metrics=generated_codes_and_metrics, @@ -970,14 +983,17 @@ def _compute_po_losses_from_outputs( per_token_logps = self._get_per_token_logps(codebook_logits, codebook_labels, audio_loss_mask) # Ensure the GRPO policy gradient trick stays in fp32 to preserve gradient signal with torch.cuda.amp.autocast(enabled=False): - per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1)) + per_token_loss = -( + torch.exp(per_token_logps.float() - per_token_logps.float().detach()) + * advantages.float().unsqueeze(1) + ) per_token_loss = per_token_loss * group_validities.float().unsqueeze(1) # Per-token entropy of the policy distribution (always computed for logging). 
with torch.cuda.amp.autocast(enabled=False): logits_fp32 = codebook_logits.float() - log_probs = logits_fp32.log_softmax(-1) # [B, T, V] - probs = log_probs.exp() # [B, T, V] + log_probs = logits_fp32.log_softmax(-1) # [B, T, V] + probs = log_probs.exp() # [B, T, V] per_token_entropy = -(probs * log_probs).sum(-1) # [B, T] codebook_entropy = ( (per_token_entropy * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8) @@ -991,7 +1007,9 @@ def _compute_po_losses_from_outputs( ) with torch.cuda.amp.autocast(enabled=False): per_token_kl = ( - torch.exp(per_token_ref_logps.float() - per_token_logps.float()) - (per_token_ref_logps.float() - per_token_logps.float()) - 1 + torch.exp(per_token_ref_logps.float() - per_token_logps.float()) + - (per_token_ref_logps.float() - per_token_logps.float()) + - 1 ) per_token_loss = per_token_loss + self.cfg.get('grpo_beta', 0.0) * per_token_kl codebook_kl_loss_mean = ( @@ -1140,10 +1158,12 @@ def training_step(self, batch, batch_idx): # Snapshot weights before optimizer step to measure weight deltas. 
prev_weights = self._snapshot_trainable_weights() - generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( - batch=batch, - n_generations_per_item=n_generations_per_item, - mode='train', + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = ( + self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode='train', + ) ) teacher_forced_start_time = time.perf_counter() po_outputs = self._run_teacher_forced_chunked_po( diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 70ca811f58a2..cf325b91d71c 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -84,6 +84,7 @@ class InferenceConfig: longform_word_threshold: int = 40 # Word threshold for auto-detection is_decoder_only_model: bool = False + def build_identifier(self) -> str: """Build a unique identifier string for this configuration. @@ -374,6 +375,7 @@ def _run_decoder_only_inference( item_idx += 1 return all_rtf_metrics, generated_audio_paths, codec_file_paths + @staticmethod def _batch_to_cuda(batch: dict) -> dict: """Move batch tensors to CUDA device.""" diff --git a/nemo/collections/tts/modules/utmosv2.py b/nemo/collections/tts/modules/utmosv2.py index 46b17316d0ea..e71d7e5f0316 100644 --- a/nemo/collections/tts/modules/utmosv2.py +++ b/nemo/collections/tts/modules/utmosv2.py @@ -77,7 +77,7 @@ def process_directory( """ if num_workers is None: num_workers = batch_size - + with torch.inference_mode(): # UTMOSV2 tends to launch many of OpenMP threads which overloads the machine's CPUs # while actually slowing down the prediction. Limit the number of threads here. 
diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index cf6dbbdcd494..1bbb88ef3434 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -42,12 +42,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import string import os import shutil +import string import tempfile -from enum import Enum from collections import defaultdict +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import librosa @@ -62,7 +62,6 @@ from nemo.utils import logging from nemo.utils.decorators import deprecated - try: from lightning.pytorch.utilities import rank_zero_only except ModuleNotFoundError: @@ -896,7 +895,9 @@ def transcribe_with_whisper_from_filepaths( transcripts = [""] * len(audio_filepaths) for lang, indices in grouped_indices.items(): - forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + forced_decoder_ids = ( + whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + ) for start_idx in range(0, len(indices), batch_size): batch_indices = indices[start_idx : start_idx + batch_size] speech_arrays = [librosa.load(audio_filepaths[idx], sr=16000)[0] for idx in batch_indices] diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py index 2b488eb4097c..c3bb7008a509 100644 --- a/scripts/magpietts/create_crosslingual_context_dataset.py +++ b/scripts/magpietts/create_crosslingual_context_dataset.py @@ -65,6 +65,7 @@ # YAML / shar helpers # --------------------------------------------------------------------------- + def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: """ Parse the master multilingual YAML and each per-language YAML it references. 
@@ -90,7 +91,9 @@ def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: for ce in child_entries: shar_path = ce.get("shar_path", {}) if "context_audio" not in shar_path: - logging.debug(f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}") + logging.debug( + f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}" + ) continue lang_to_shar_entries[lang].append(ce) @@ -108,8 +111,8 @@ def expand_shar_range(pattern: str) -> List[str]: start_idx = int(match.group(1)) end_idx = int(match.group(2)) width = len(match.group(1)) - prefix = pattern[:match.start()] - suffix = pattern[match.end():] + prefix = pattern[: match.start()] + suffix = pattern[match.end() :] return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] @@ -128,6 +131,7 @@ def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: # Stage 1: Build speaker embedding index # --------------------------------------------------------------------------- + def discover_speakers_from_cuts( lang_to_shar_entries: Dict[str, List[Dict]], max_cuts_per_speaker: int, @@ -183,7 +187,9 @@ def discover_speakers_from_cuts( except Exception as e: logging.warning(f"Error reading {cuts_file}: {e}") - logging.info(f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages") + logging.info( + f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages" + ) for lang in sorted(lang_to_shar_entries.keys()): n = sum(1 for v in speaker_info.values() if v["language"] == lang) logging.info(f" {lang}: {n} speakers") @@ -210,7 +216,7 @@ def compute_speaker_embeddings( if not cut_metas: continue grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) - for (se, shard_idx, cut_json) in cut_metas: + for se, shard_idx, cut_json in cut_metas: shar_key = json.dumps(se["shar_path"], sort_keys=True) 
grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) speakers_needing_audio[spk] = { @@ -228,7 +234,7 @@ def compute_speaker_embeddings( for spk, data in speakers_needing_audio.items(): for shar_key, shard_map in data["grouped"].items(): for shard_idx, items in shard_map.items(): - for (se, cut_json) in items: + for se, cut_json in items: cut_id = cut_json.get("id", "") shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) @@ -368,6 +374,7 @@ def run_stage1( # Stage 2: Cross-lingual speaker matching + language-balanced sampling # --------------------------------------------------------------------------- + def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: """ For each speaker S in language L, find the closest speaker S' from a different @@ -435,7 +442,9 @@ def sample_balanced_cuts( # Collect 3x the target to allow shuffling diversity collect_secs_per_lang = secs_per_lang * 3 - logging.info(f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)") + logging.info( + f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)" + ) all_matched_speakers = set(v[0] for v in cross_lingual_map.values()) @@ -456,8 +465,7 @@ def sample_balanced_cuts( if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: cuts_files = cuts_files[:max_shards_per_dataset] logging.info( - f" Limiting to {max_shards_per_dataset} shards for dataset: " - f"{se['shar_path']['cuts']}" + f" Limiting to {max_shards_per_dataset} shards for dataset: " f"{se['shar_path']['cuts']}" ) for cuts_file in cuts_files: if lang_done: @@ -514,6 +522,7 @@ def sample_balanced_cuts( # Stage 3: Extract audio + write NeMo manifest # --------------------------------------------------------------------------- + def run_stage3( target_cuts_by_lang: Dict[str, List[Dict]], context_pool_by_speaker: Dict[str, List], @@ -546,21 +555,25 @@ 
def run_stage3( matched_spk, ssim = cross_lingual_map[spk] ctx_pool = context_pool_by_speaker.get(matched_spk, []) if not ctx_pool: - logging.warning(f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}") + logging.warning( + f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}" + ) continue ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool) - assignments.append({ - "target_cut_json": cut_json, - "target_shar_entry": cut_json["_shar_entry"], - "target_shard_idx": cut_json["_shard_idx"], - "target_speaker": spk, - "context_cut_json": ctx_cut_json, - "context_shar_entry": ctx_se, - "context_shard_idx": ctx_shard_idx, - "context_speaker": matched_spk, - "ssim": ssim, - "lang": lang, - }) + assignments.append( + { + "target_cut_json": cut_json, + "target_shar_entry": cut_json["_shar_entry"], + "target_shard_idx": cut_json["_shard_idx"], + "target_speaker": spk, + "context_cut_json": ctx_cut_json, + "context_shar_entry": ctx_se, + "context_shard_idx": ctx_shard_idx, + "context_speaker": matched_spk, + "ssim": ssim, + "lang": lang, + } + ) logging.info(f"[Stage 3] Total assignments: {len(assignments)}") @@ -628,7 +641,9 @@ def _save_audio_from_shard( safe_id = cut.id.replace("/", "_") out_file = os.path.join(out_subdir, f"{safe_id}.wav") sf.write(out_file, audio_np, sample_rate) - out_paths_array[assign_idx] = os.path.relpath(out_file, os.path.join(output_dir, "extracted_audio")) + out_paths_array[assign_idx] = os.path.relpath( + out_file, os.path.join(output_dir, "extracted_audio") + ) del needed_cut_ids[cut.id] if not needed_cut_ids: break @@ -638,15 +653,23 @@ def _save_audio_from_shard( # Extract target audio logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...") _save_audio_from_shard( - target_loads, assignments, "target_cut_json", - target_audio_dir, target_audio_paths, "target_audio", + target_loads, + assignments, + "target_cut_json", + 
target_audio_dir, + target_audio_paths, + "target_audio", ) # Extract context audio logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...") _save_audio_from_shard( - context_loads, assignments, "context_cut_json", - context_audio_dir, context_audio_paths, "context_audio", + context_loads, + assignments, + "context_cut_json", + context_audio_dir, + context_audio_paths, + "context_audio", ) # Write manifest @@ -691,9 +714,11 @@ def _save_audio_from_shard( # Carry over any additional custom fields from the target supervision _exclude_custom_keys = { - "target_audio_codes_path", "context_audio_codes_path", - "context_audio_text", "context_audio_normalized_text", - "context_audio_offset" + "target_audio_codes_path", + "context_audio_codes_path", + "context_audio_text", + "context_audio_normalized_text", + "context_audio_offset", } for k, v in t_sup.get("custom", {}).items(): if k not in entry and k not in _exclude_custom_keys: @@ -710,6 +735,7 @@ def _save_audio_from_shard( # YAML config generation (post Stage 4) # --------------------------------------------------------------------------- + def generate_yaml_config(lhotse_shar_dir: str, output_yaml_path: str, data_mount_prefix: str = "/data"): """ Generate a lhotse YAML config pointing to the cross-lingual shar dataset. 
@@ -775,16 +801,18 @@ def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" ) - yaml_entry = [{ - "type": "lhotse_shar", - "shar_path": shar_path, - "weight": 1.0, - "tags": { - "task": "tts", - "lang": "crosslingual", - "tokenizer_names": ["nemotron_nano_30b"], - }, - }] + yaml_entry = [ + { + "type": "lhotse_shar", + "shar_path": shar_path, + "weight": 1.0, + "tags": { + "task": "tts", + "lang": "crosslingual", + "tokenizer_names": ["nemotron_nano_30b"], + }, + } + ] os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) with open(output_yaml_path, 'w') as f: @@ -796,56 +824,77 @@ def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: # Main # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", ) parser.add_argument( - "--master-yaml", required=True, type=str, + "--master-yaml", + required=True, + type=str, help="Path to the master multilingual YAML (e.g. 
train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).", ) parser.add_argument( - "--output-dir", required=True, type=str, + "--output-dir", + required=True, + type=str, help="Base directory for all outputs (extracted audio, manifest, speaker index).", ) parser.add_argument( - "--target-hours", type=float, default=50.0, + "--target-hours", + type=float, + default=50.0, help="Total hours of target audio to sample (split equally across languages).", ) parser.add_argument( - "--samples-per-speaker", type=int, default=5, + "--samples-per-speaker", + type=int, + default=5, help="Number of utterances per speaker to use for computing the average TitaNet embedding.", ) parser.add_argument( - "--sample-rate", type=int, default=24000, + "--sample-rate", + type=int, + default=24000, help="Sample rate for saving extracted audio files.", ) parser.add_argument( - "--embedding-batch-size", type=int, default=16, + "--embedding-batch-size", + type=int, + default=16, help="Batch size for TitaNet embedding computation.", ) parser.add_argument( - "--max-shards-per-dataset", type=int, default=0, + "--max-shards-per-dataset", + type=int, + default=0, help="Max number of .jsonl.gz shard files to scan per dataset during " - "speaker discovery (Stage 1). 0 means scan all shards. " - "Setting this to e.g. 10 dramatically speeds up discovery while " - "still finding most speakers.", + "speaker discovery (Stage 1). 0 means scan all shards. " + "Setting this to e.g. 
10 dramatically speeds up discovery while " + "still finding most speakers.", ) parser.add_argument( - "--seed", type=int, default=42, + "--seed", + type=int, + default=42, help="Random seed for reproducibility.", ) parser.add_argument( - "--log-level", type=str, default="INFO", + "--log-level", + type=str, + default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Logging level.", ) parser.add_argument( - "--generate-yaml", type=str, default=None, + "--generate-yaml", + type=str, + default=None, help="If provided, skip stages 1-3 and instead generate a YAML config " - "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. " - "Value is the output YAML file path.", + "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. " + "Value is the output YAML file path.", ) args = parser.parse_args() @@ -893,15 +942,22 @@ def main(): # --- Stage 2: Cross-lingual matching + balanced sampling --- cross_lingual_map = build_crosslingual_map(speaker_embeddings) target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts( - lang_to_shar_entries, cross_lingual_map, - target_hours=args.target_hours, seed=args.seed, + lang_to_shar_entries, + cross_lingual_map, + target_hours=args.target_hours, + seed=args.seed, max_shards_per_dataset=args.max_shards_per_dataset, ) # --- Stage 3: Extract audio + write manifest --- manifest_path = run_stage3( - target_cuts_by_lang, context_pool_by_speaker, cross_lingual_map, - speaker_embeddings, args.output_dir, args.sample_rate, args.seed, + target_cuts_by_lang, + context_pool_by_speaker, + cross_lingual_map, + speaker_embeddings, + args.output_dir, + args.sample_rate, + args.seed, ) # --- Summary --- diff --git a/scripts/magpietts/inspect_crosslingual_dataset.py b/scripts/magpietts/inspect_crosslingual_dataset.py index 9ce0c648fe39..6fed5c93adf5 100644 --- a/scripts/magpietts/inspect_crosslingual_dataset.py +++ b/scripts/magpietts/inspect_crosslingual_dataset.py @@ -111,9 +111,15 @@ def main(): decoded_target, 
decoded_target_len = codec_model.decode( tokens=target_codes_t, tokens_len=target_codes_len ) - decoded_target_np = decoded_target[0, :decoded_target_len[0]].cpu().float().numpy() - sf.write(os.path.join(sample_dir, "target_decoded_from_codes.wav"), decoded_target_np, codec_model.output_sample_rate) - logging.info(f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}") + decoded_target_np = decoded_target[0, : decoded_target_len[0]].cpu().float().numpy() + sf.write( + os.path.join(sample_dir, "target_decoded_from_codes.wav"), + decoded_target_np, + codec_model.output_sample_rate, + ) + logging.info( + f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}" + ) else: logging.warning(f" No target_codes found for cut {cut.id}") @@ -123,12 +129,16 @@ def main(): ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device) # (1, C, T) ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device) with torch.inference_mode(): - decoded_ctx, decoded_ctx_len = codec_model.decode( - tokens=ctx_codes_t, tokens_len=ctx_codes_len - ) - decoded_ctx_np = decoded_ctx[0, :decoded_ctx_len[0]].cpu().float().numpy() - sf.write(os.path.join(sample_dir, "context_decoded_from_codes.wav"), decoded_ctx_np, codec_model.output_sample_rate) - logging.info(f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}") + decoded_ctx, decoded_ctx_len = codec_model.decode(tokens=ctx_codes_t, tokens_len=ctx_codes_len) + decoded_ctx_np = decoded_ctx[0, : decoded_ctx_len[0]].cpu().float().numpy() + sf.write( + os.path.join(sample_dir, "context_decoded_from_codes.wav"), + decoded_ctx_np, + codec_model.output_sample_rate, + ) + logging.info( + f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), 
codes shape: {ctx_codes_np.shape}" + ) else: logging.warning(f" No context_codes found for cut {cut.id}") diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index d225136989f1..3741deddf430 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -142,7 +142,7 @@ def create_synthetic_batch( text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) for b in range(batch_size): tl = text_lens_list[b] - text[b, :tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) + text[b, : tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) text[b, tl - 1] = model.eos_id # EOS as last valid token text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) @@ -161,7 +161,9 @@ def create_synthetic_batch( audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) # Context audio codes (raw, without BOS/EOS) - context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device) + context_audio_codes = torch.zeros( + batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device + ) for b in range(batch_size): caf = context_audio_frames_list[b] context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) @@ -249,8 +251,10 @@ def compare_audio_codes(model, pb_output, ib_output, batch): num_show = min(10, mismatch_positions.size(0)) for i in range(num_show): cb, t = mismatch_positions[i].tolist() - print(f" Mismatch at codebook={cb}, time={t}: " - f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}") + print( + f" Mismatch at codebook={cb}, time={t}: " + f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}" + ) return all_match @@ -308,7 +312,7 @@ def compare_phoneme_predictions(model, pb_output, ib_output, batch): # 
infer_batch phoneme preds: slice from start_idx for this batch item start = max(0, ib_start_idx[b].item()) - ib_ph_b = ib_phoneme_preds[b, :, start:start + compare_len] + ib_ph_b = ib_phoneme_preds[b, :, start : start + compare_len] matches = (pb_ph_b == ib_ph_b).all() num_matching = (pb_ph_b == ib_ph_b).sum().item() @@ -325,8 +329,10 @@ def compare_phoneme_predictions(model, pb_output, ib_output, batch): num_show = min(10, mismatch_positions.size(0)) for i in range(num_show): sf, t = mismatch_positions[i].tolist() - print(f" Mismatch at stacking_factor={sf}, time={t}: " - f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}") + print( + f" Mismatch at stacking_factor={sf}, time={t}: " + f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}" + ) return all_match From 1f0f83f38b6d811d059073603560825d91e89175 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:04:03 -0400 Subject: [PATCH 74/94] tokenizer import change Signed-off-by: Paarth Neekhara --- .../common/tokenizers/text_to_speech/tts_tokenizers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 81f875750d64..65b27bc6b62f 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -14,11 +14,13 @@ # limitations under the License. 
import itertools +import os import string from abc import ABC, abstractmethod from contextlib import contextmanager from typing import List, Optional, Union +from tokenizers import Tokenizer from transformers import PreTrainedTokenizerBase from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import ( @@ -1180,10 +1182,6 @@ class IPABPETokenizer: """ def __init__(self, tokenizer_path: str): - import os - - from tokenizers import Tokenizer - if os.path.isdir(tokenizer_path): tokenizer_file = os.path.join(tokenizer_path, "tokenizer.json") else: From a61b60a7277c128b5068e2bcf2922796c63709d4 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:12:50 -0400 Subject: [PATCH 75/94] remove unnecessary imports Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index 0c9a8c182b71..866f418dbacd 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -15,12 +15,6 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules import nemo.collections.tts.modules.moe_modules -from nemo.collections.tts.modules.nemotron_h_decoder import ( - HybridMambaAttentionDynamicCache, - NemotronHConfig, - NemotronHForCausalLM, - NemotronHModel, -) from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet From f090106e12de460303ebd7f0d30ad9bf72ae8700 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:28:30 -0400 Subject: [PATCH 76/94] cleanup Signed-off-by: Paarth Neekhara --- .../tts/modules/nemotron_h_decoder.py | 4 +- .../create_crosslingual_context_dataset.py | 995 -- .../magpietts/inspect_crosslingual_dataset.py | 161 - 
...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 ----------------- 4 files changed, 2 insertions(+), 11112 deletions(-) delete mode 100644 scripts/magpietts/create_crosslingual_context_dataset.py delete mode 100644 scripts/magpietts/inspect_crosslingual_dataset.py delete mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index ec30a1e7a699..986359c0e2b3 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -21,8 +21,8 @@ """ import math -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union import torch import torch.nn.functional as F diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py deleted file mode 100644 index c3bb7008a509..000000000000 --- a/scripts/magpietts/create_crosslingual_context_dataset.py +++ /dev/null @@ -1,995 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Creates a cross-lingual context dataset for TTS training. 
- -For each target utterance in language A, finds the closest speaker voice from a -different language B (using TitaNet speaker embeddings) and pairs the target with -context audio from that cross-lingual speaker. - -The script operates in three stages: - Stage 1: Build a per-speaker TitaNet embedding index across all languages. - Stage 2: Compute cross-lingual speaker matches and sample a language-balanced subset. - Stage 3: Extract audio to disk and write a NeMo-format JSONL manifest. - -After running this script, use create_lhotse_shar_from_nemo_manifest.py to convert -the output manifest into lhotse shar format, then optionally run -extend_lhotse_shards_with_audio_codes.py to add codec codes. - -Example usage: - python scripts/magpietts/create_crosslingual_context_dataset.py \ - --master-yaml /data/magpie_pretraining_data/manifests/ipa_manifests/train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml \ - --output-dir /data/crosslingual_context_dataset \ - --target-hours 50.0 \ - --samples-per-speaker 5 \ - --seed 42 \ - --log-level INFO -""" - -import argparse -import glob as glob_module -import gzip -import json -import logging -import os -import pickle -import random -import re -from collections import defaultdict -from typing import Any, Dict, List, Tuple - -import numpy as np -import soundfile as sf -import torch -import yaml -from lhotse import CutSet -from tqdm import tqdm - -TITANET_MODEL_NAME = "nvidia/speakerverification_en_titanet_large" -TITANET_SAMPLE_RATE = 16000 - - -# --------------------------------------------------------------------------- -# YAML / shar helpers -# --------------------------------------------------------------------------- - - -def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: - """ - Parse the master multilingual YAML and each per-language YAML it references. - Returns {language: [list of shar_entry dicts with context_audio]}. 
- """ - yaml_base_dir = os.path.dirname(yaml_path) - with open(yaml_path, 'r') as f: - master_entries = yaml.safe_load(f) - - lang_to_shar_entries: Dict[str, List[Dict]] = defaultdict(list) - for entry in master_entries: - lang = entry.get("tags", {}).get("lang") - child_yaml_path = entry.get("input_cfg") - if not lang or not child_yaml_path: - continue - if not os.path.isabs(child_yaml_path): - child_yaml_path = os.path.join(yaml_base_dir, child_yaml_path) - if not os.path.isfile(child_yaml_path): - logging.warning(f"Per-language YAML not found: {child_yaml_path}") - continue - with open(child_yaml_path, 'r') as f: - child_entries = yaml.safe_load(f) - for ce in child_entries: - shar_path = ce.get("shar_path", {}) - if "context_audio" not in shar_path: - logging.debug( - f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}" - ) - continue - lang_to_shar_entries[lang].append(ce) - - return dict(lang_to_shar_entries) - - -def expand_shar_range(pattern: str) -> List[str]: - """ - Expand a shar path pattern like '.../cuts.{000000..001231}.jsonl.gz' - into a list of concrete file paths. - """ - match = re.search(r'\{(\d+)\.\.(\d+)\}', pattern) - if not match: - return [pattern] - start_idx = int(match.group(1)) - end_idx = int(match.group(2)) - width = len(match.group(1)) - prefix = pattern[: match.start()] - suffix = pattern[match.end() :] - return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] - - -def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: - """Extract (language, dataset, speaker_id) from '| Language:XX Dataset:YYY Speaker:ZZZ |'.""" - lang_m = re.search(r"Language:(\w+)", speaker_str) - dataset_m = re.search(r"Dataset:([\w\d\W]+?) Speaker:", speaker_str) - spk_m = re.search(r"Speaker:([\w\d\W]+?) 
\|", speaker_str) - lang = lang_m.group(1) if lang_m else "unknown" - dataset = dataset_m.group(1).strip() if dataset_m else "unknown" - speaker_id = spk_m.group(1).strip() if spk_m else "unknown" - return lang, dataset, speaker_id - - -# --------------------------------------------------------------------------- -# Stage 1: Build speaker embedding index -# --------------------------------------------------------------------------- - - -def discover_speakers_from_cuts( - lang_to_shar_entries: Dict[str, List[Dict]], - max_cuts_per_speaker: int, - max_shards_per_dataset: int = 0, -) -> Dict[str, Dict]: - """ - Pass 1 (metadata only): Read cut JSONL files to discover unique speakers - and collect up to max_cuts_per_speaker cut metadata entries per speaker. - - Args: - max_shards_per_dataset: If > 0, only scan this many .jsonl.gz shard - files per shar group (dataset) instead of all shards. This - dramatically speeds up discovery for large datasets while still - finding most speakers. - - Returns: {speaker_str: {"language": str, "cut_metas": [list of (shar_entry, shard_idx, cut_json_dict)]}} - """ - speaker_info: Dict[str, Dict] = {} - - for lang, shar_entries in lang_to_shar_entries.items(): - logging.info(f"[Stage 1] Discovering speakers for language: {lang} ({len(shar_entries)} shar groups)") - for se in shar_entries: - cuts_pattern = se["shar_path"]["cuts"] - cuts_files = expand_shar_range(cuts_pattern) - if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: - logging.info( - f" Limiting scan to {max_shards_per_dataset}/{len(cuts_files)} " - f"shards for dataset: {cuts_pattern}" - ) - cuts_files = cuts_files[:max_shards_per_dataset] - for cuts_file in cuts_files: - if not os.path.isfile(cuts_file): - continue - shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) - shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 - try: - with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: - for line in f: - cut_json = 
json.loads(line) - supervisions = cut_json.get("supervisions", []) - if not supervisions: - continue - speaker_str = supervisions[0].get("speaker", "") - if not speaker_str: - continue - if speaker_str not in speaker_info: - speaker_info[speaker_str] = { - "language": lang, - "cut_metas": [], - } - if len(speaker_info[speaker_str]["cut_metas"]) < max_cuts_per_speaker: - speaker_info[speaker_str]["cut_metas"].append((se, shard_idx, cut_json)) - except Exception as e: - logging.warning(f"Error reading {cuts_file}: {e}") - - logging.info( - f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages" - ) - for lang in sorted(lang_to_shar_entries.keys()): - n = sum(1 for v in speaker_info.values() if v["language"] == lang) - logging.info(f" {lang}: {n} speakers") - return speaker_info - - -def compute_speaker_embeddings( - speaker_info: Dict[str, Dict], - sv_model: torch.nn.Module, - device: torch.device, - batch_size: int = 16, -) -> Dict[str, Dict]: - """ - Pass 2: For each speaker, load audio from shar tars for the sampled cuts, - compute TitaNet embeddings, and average them into a single representative vector. 
- - Returns: {speaker_str: {"language": str, "embedding": np.ndarray}} - """ - speaker_embeddings: Dict[str, Dict] = {} - - speakers_needing_audio = {} - for spk, info in speaker_info.items(): - cut_metas = info["cut_metas"] - if not cut_metas: - continue - grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) - for se, shard_idx, cut_json in cut_metas: - shar_key = json.dumps(se["shar_path"], sort_keys=True) - grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) - speakers_needing_audio[spk] = { - "language": info["language"], - "grouped": grouped_by_shar_and_shard, - } - - # Collect audio in batches: load from shar, accumulate waveforms per speaker - speaker_audio_tensors: Dict[str, List[torch.Tensor]] = defaultdict(list) - - logging.info(f"[Stage 1] Loading audio for {len(speakers_needing_audio)} speakers to compute embeddings...") - - # Group all (shar_entry, shard_idx) that we need to load - shar_shard_to_speakers: Dict[Tuple[str, int], List[Tuple[str, str]]] = defaultdict(list) - for spk, data in speakers_needing_audio.items(): - for shar_key, shard_map in data["grouped"].items(): - for shard_idx, items in shard_map.items(): - for se, cut_json in items: - cut_id = cut_json.get("id", "") - shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) - - # Process shard by shard to minimize tar file openings - total_shards = len(shar_shard_to_speakers) - for (shar_key, shard_idx), spk_cut_pairs in tqdm( - shar_shard_to_speakers.items(), desc="[Stage 1] Loading audio shards", total=total_shards - ): - se_shar_path = json.loads(shar_key) - cuts_files = expand_shar_range(se_shar_path["cuts"]) - target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) - - if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): - logging.warning(f"Shard index {shard_idx} out of range, skipping") - continue - - cut_file = cuts_files[shard_idx] - target_tar = 
target_audio_files[shard_idx] - - if not os.path.isfile(cut_file) or not os.path.isfile(target_tar): - logging.warning(f"Missing shard files: cuts={cut_file}, target={target_tar}") - continue - - needed_cut_ids = {cut_id for (_, cut_id) in spk_cut_pairs} - cut_id_to_spk = {cut_id: spk for (spk, cut_id) in spk_cut_pairs} - - try: - fields = { - "cuts": [cut_file], - "recording": [target_tar], - } - # Also include context_recording if available, to avoid errors - context_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) - if shard_idx < len(context_audio_files) and os.path.isfile(context_audio_files[shard_idx]): - fields["context_recording"] = [context_audio_files[shard_idx]] - - shard_cutset = CutSet.from_shar(fields=fields) - for cut in shard_cutset: - if cut.id in needed_cut_ids: - spk = cut_id_to_spk[cut.id] - audio_np = cut.recording.resample(TITANET_SAMPLE_RATE).load_audio().squeeze(0) - audio_tensor = torch.from_numpy(audio_np).float() - speaker_audio_tensors[spk].append(audio_tensor) - needed_cut_ids.discard(cut.id) - if not needed_cut_ids: - break - except Exception as e: - logging.warning(f"Error loading shard {cut_file}: {e}") - - # Now compute embeddings in batches - logging.info(f"[Stage 1] Computing TitaNet embeddings for {len(speaker_audio_tensors)} speakers...") - all_speakers = list(speaker_audio_tensors.keys()) - - for batch_start in tqdm(range(0, len(all_speakers), batch_size), desc="[Stage 1] TitaNet batches"): - batch_speakers = all_speakers[batch_start : batch_start + batch_size] - audio_list = [] - audio_lens = [] - spk_indices = [] # maps each audio in batch back to speaker - - for spk in batch_speakers: - for audio_t in speaker_audio_tensors[spk]: - audio_list.append(audio_t.to(device)) - audio_lens.append(audio_t.size(0)) - spk_indices.append(spk) - - if not audio_list: - continue - - batch_lens = torch.tensor(audio_lens, device=device).long() - max_len = int(batch_lens.max().item()) - padded = 
torch.zeros(len(audio_list), max_len, device=device, dtype=torch.float32) - for i, t in enumerate(audio_list): - padded[i, : t.size(0)] = t - - with torch.inference_mode(): - _, embeddings = sv_model.forward(input_signal=padded, input_signal_length=batch_lens) - - embeddings_np = embeddings.cpu().float().numpy() - - # Average embeddings per speaker - spk_emb_accum: Dict[str, List[np.ndarray]] = defaultdict(list) - for i, spk in enumerate(spk_indices): - spk_emb_accum[spk].append(embeddings_np[i]) - - for spk in batch_speakers: - if spk in spk_emb_accum and spk_emb_accum[spk]: - avg_emb = np.mean(spk_emb_accum[spk], axis=0) - avg_emb = avg_emb / (np.linalg.norm(avg_emb) + 1e-8) - speaker_embeddings[spk] = { - "language": speakers_needing_audio[spk]["language"], - "embedding": avg_emb, - } - - logging.info(f"[Stage 1] Computed embeddings for {len(speaker_embeddings)} speakers") - return speaker_embeddings - - -def run_stage1( - lang_to_shar_entries: Dict[str, List[Dict]], - samples_per_speaker: int, - device: torch.device, - index_path: str, - batch_size: int = 16, - max_shards_per_dataset: int = 0, -) -> Dict[str, Dict]: - """Run full Stage 1: discover speakers, load audio, compute embeddings, save index.""" - if os.path.isfile(index_path): - logging.info(f"[Stage 1] Loading cached speaker index from {index_path}") - with open(index_path, 'rb') as f: - return pickle.load(f) - - from nemo.collections.asr.models import EncDecSpeakerLabelModel - - logging.info(f"[Stage 1] Loading TitaNet model: {TITANET_MODEL_NAME}") - sv_model = EncDecSpeakerLabelModel.from_pretrained(TITANET_MODEL_NAME) - sv_model = sv_model.to(device) - sv_model.eval() - - speaker_info = discover_speakers_from_cuts( - lang_to_shar_entries, - max_cuts_per_speaker=samples_per_speaker, - max_shards_per_dataset=max_shards_per_dataset, - ) - speaker_embeddings = compute_speaker_embeddings(speaker_info, sv_model, device, batch_size=batch_size) - - os.makedirs(os.path.dirname(index_path), exist_ok=True) - 
with open(index_path, 'wb') as f: - pickle.dump(speaker_embeddings, f) - logging.info(f"[Stage 1] Saved speaker index to {index_path}") - - del sv_model - torch.cuda.empty_cache() - return speaker_embeddings - - -# --------------------------------------------------------------------------- -# Stage 2: Cross-lingual speaker matching + language-balanced sampling -# --------------------------------------------------------------------------- - - -def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: - """ - For each speaker S in language L, find the closest speaker S' from a different - language by cosine similarity of their TitaNet embeddings. - - Returns: {speaker_str: (best_match_speaker_str, cosine_similarity)} - """ - speakers = list(speaker_embeddings.keys()) - n = len(speakers) - logging.info(f"[Stage 2] Building cross-lingual map for {n} speakers...") - - # Build embedding matrix - emb_matrix = np.stack([speaker_embeddings[s]["embedding"] for s in speakers]) - langs = [speaker_embeddings[s]["language"] for s in speakers] - - # Cosine similarity matrix (embeddings are already L2-normalized) - sim_matrix = emb_matrix @ emb_matrix.T - - cross_lingual_map: Dict[str, Tuple[str, float]] = {} - for i in range(n): - best_j = -1 - best_sim = -2.0 - for j in range(n): - if langs[j] == langs[i]: - continue - if sim_matrix[i, j] > best_sim: - best_sim = sim_matrix[i, j] - best_j = j - if best_j >= 0: - cross_lingual_map[speakers[i]] = (speakers[best_j], float(best_sim)) - else: - logging.warning(f"No cross-lingual match found for speaker: {speakers[i]}") - - logging.info(f"[Stage 2] Built cross-lingual map with {len(cross_lingual_map)} entries") - avg_sim = np.mean([v[1] for v in cross_lingual_map.values()]) if cross_lingual_map else 0 - logging.info(f"[Stage 2] Average cross-lingual similarity: {avg_sim:.4f}") - return cross_lingual_map - - -def sample_balanced_cuts( - lang_to_shar_entries: Dict[str, List[Dict]], - 
cross_lingual_map: Dict[str, Tuple[str, float]], - target_hours: float, - seed: int, - max_shards_per_dataset: int = 0, -) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]: - """ - Sample cuts across languages so each language contributes approximately - target_hours / num_languages hours of target audio. - - Args: - max_shards_per_dataset: If > 0, only read this many shard files per - dataset. Since we only need ~6.25h per language, reading a small - fraction of shards is sufficient and avoids scanning tens of - thousands of files for large datasets. - - Returns: - target_cuts_by_lang: {lang: [list of cut_json dicts with extra metadata]} - context_pool_by_speaker: {speaker_str: [list of (shar_entry, shard_idx, cut_json)]} - """ - rng = random.Random(seed) - num_langs = len(lang_to_shar_entries) - hours_per_lang = target_hours / num_langs - secs_per_lang = hours_per_lang * 3600 - # Collect 3x the target to allow shuffling diversity - collect_secs_per_lang = secs_per_lang * 3 - - logging.info( - f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)" - ) - - all_matched_speakers = set(v[0] for v in cross_lingual_map.values()) - - target_cuts_by_lang: Dict[str, List[Dict]] = {} - context_pool_by_speaker: Dict[str, List] = defaultdict(list) - - for lang, shar_entries in lang_to_shar_entries.items(): - logging.info(f"[Stage 2] Reading cuts for language: {lang}") - lang_cuts = [] - lang_collected_secs = 0.0 - lang_done = False - - for se in shar_entries: - if lang_done: - break - cuts_pattern = se["shar_path"]["cuts"] - cuts_files = expand_shar_range(cuts_pattern) - if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: - cuts_files = cuts_files[:max_shards_per_dataset] - logging.info( - f" Limiting to {max_shards_per_dataset} shards for dataset: " f"{se['shar_path']['cuts']}" - ) - for cuts_file in cuts_files: - if lang_done: - break - if not os.path.isfile(cuts_file): - continue - 
shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) - shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 - try: - with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: - for line in f: - cut_json = json.loads(line) - speaker_str = cut_json.get("supervisions", [{}])[0].get("speaker", "") - if not speaker_str: - continue - if speaker_str in all_matched_speakers: - context_pool_by_speaker[speaker_str].append((se, shard_idx, cut_json)) - if speaker_str in cross_lingual_map: - cut_json["_shar_entry"] = se - cut_json["_shard_idx"] = shard_idx - cut_json["_speaker_str"] = speaker_str - lang_cuts.append(cut_json) - lang_collected_secs += cut_json.get("duration", 0) - if lang_collected_secs >= collect_secs_per_lang: - lang_done = True - break - except Exception as e: - logging.warning(f"Error reading {cuts_file}: {e}") - - logging.info(f" {lang}: {len(lang_cuts)} candidate target cuts ({lang_collected_secs / 3600:.2f}h collected)") - - rng.shuffle(lang_cuts) - sampled = [] - total_dur = 0.0 - for cut_json in lang_cuts: - dur = cut_json.get("duration", 0) - if dur <= 0: - continue - sampled.append(cut_json) - total_dur += dur - if total_dur >= secs_per_lang: - break - - target_cuts_by_lang[lang] = sampled - logging.info(f" {lang}: sampled {len(sampled)} cuts, {total_dur / 3600:.2f}h") - - total_sampled = sum(len(v) for v in target_cuts_by_lang.values()) - total_hours = sum(sum(c.get("duration", 0) for c in v) for v in target_cuts_by_lang.values()) / 3600 - logging.info(f"[Stage 2] Total sampled: {total_sampled} cuts, {total_hours:.2f}h") - return target_cuts_by_lang, dict(context_pool_by_speaker) - - -# --------------------------------------------------------------------------- -# Stage 3: Extract audio + write NeMo manifest -# --------------------------------------------------------------------------- - - -def run_stage3( - target_cuts_by_lang: Dict[str, List[Dict]], - context_pool_by_speaker: Dict[str, List], - cross_lingual_map: Dict[str, 
Tuple[str, float]], - speaker_embeddings: Dict[str, Dict], - output_dir: str, - sample_rate: int, - seed: int, -): - """ - For each sampled target cut, pick a context utterance from the matched - cross-lingual speaker, extract both audios to disk, and write the manifest. - """ - rng = random.Random(seed) - audio_dir = os.path.join(output_dir, "extracted_audio") - target_audio_dir = os.path.join(audio_dir, "target") - context_audio_dir = os.path.join(audio_dir, "context") - os.makedirs(target_audio_dir, exist_ok=True) - os.makedirs(context_audio_dir, exist_ok=True) - - manifest_path = os.path.join(output_dir, "manifest.json") - - # Build a quick lookup: for each context cut we might need to load, - # index by (shar_key, shard_idx, cut_id) - # First, assign a context cut to each target - assignments: List[Dict] = [] - for lang, cuts in target_cuts_by_lang.items(): - for cut_json in cuts: - spk = cut_json["_speaker_str"] - matched_spk, ssim = cross_lingual_map[spk] - ctx_pool = context_pool_by_speaker.get(matched_spk, []) - if not ctx_pool: - logging.warning( - f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}" - ) - continue - ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool) - assignments.append( - { - "target_cut_json": cut_json, - "target_shar_entry": cut_json["_shar_entry"], - "target_shard_idx": cut_json["_shard_idx"], - "target_speaker": spk, - "context_cut_json": ctx_cut_json, - "context_shar_entry": ctx_se, - "context_shard_idx": ctx_shard_idx, - "context_speaker": matched_spk, - "ssim": ssim, - "lang": lang, - } - ) - - logging.info(f"[Stage 3] Total assignments: {len(assignments)}") - - # Group by (shar_key, shard_idx) for efficient loading - # We need to load target and context audio from potentially different shards - # Strategy: process all assignments, grouping audio loads by shard - target_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list) - context_loads: Dict[Tuple[str, int], List[int]] = 
defaultdict(list) - - for idx, a in enumerate(assignments): - t_shar_key = json.dumps(a["target_shar_entry"]["shar_path"], sort_keys=True) - target_loads[(t_shar_key, a["target_shard_idx"])].append(idx) - c_shar_key = json.dumps(a["context_shar_entry"]["shar_path"], sort_keys=True) - context_loads[(c_shar_key, a["context_shard_idx"])].append(idx) - - # Arrays to hold extracted audio file paths - target_audio_paths = [None] * len(assignments) - context_audio_paths = [None] * len(assignments) - - def _save_audio_from_shard( - shard_loads: Dict[Tuple[str, int], List[int]], - assignments_list: List[Dict], - cut_json_key: str, - out_subdir: str, - out_paths_array: List, - audio_field: str, - ): - """Load cuts from shar tars and save individual audio files to disk.""" - total_shards = len(shard_loads) - for (shar_key_str, shard_idx), indices in tqdm( - shard_loads.items(), desc=f"[Stage 3] Extracting {audio_field}", total=total_shards - ): - se_shar_path = json.loads(shar_key_str) - cuts_files = expand_shar_range(se_shar_path["cuts"]) - target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) - - if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): - logging.warning(f"Shard {shard_idx} out of range, skipping") - continue - - cut_file = cuts_files[shard_idx] - tar_file = target_audio_files[shard_idx] - - if not os.path.isfile(cut_file) or not os.path.isfile(tar_file): - logging.warning(f"Missing files: {cut_file} or {tar_file}") - continue - - needed_cut_ids = {} - for i in indices: - cj = assignments_list[i][cut_json_key] - cid = cj.get("id", "") - needed_cut_ids[cid] = i - - try: - fields = {"cuts": [cut_file], "recording": [tar_file]} - ctx_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) - if ctx_audio_files and shard_idx < len(ctx_audio_files) and os.path.isfile(ctx_audio_files[shard_idx]): - fields["context_recording"] = [ctx_audio_files[shard_idx]] - - shard_cutset = CutSet.from_shar(fields=fields) - for 
cut in shard_cutset: - if cut.id in needed_cut_ids: - assign_idx = needed_cut_ids[cut.id] - audio_np = cut.recording.resample(sample_rate).load_audio().squeeze(0) - safe_id = cut.id.replace("/", "_") - out_file = os.path.join(out_subdir, f"{safe_id}.wav") - sf.write(out_file, audio_np, sample_rate) - out_paths_array[assign_idx] = os.path.relpath( - out_file, os.path.join(output_dir, "extracted_audio") - ) - del needed_cut_ids[cut.id] - if not needed_cut_ids: - break - except Exception as e: - logging.warning(f"Error processing shard {cut_file}: {e}") - - # Extract target audio - logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...") - _save_audio_from_shard( - target_loads, - assignments, - "target_cut_json", - target_audio_dir, - target_audio_paths, - "target_audio", - ) - - # Extract context audio - logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...") - _save_audio_from_shard( - context_loads, - assignments, - "context_cut_json", - context_audio_dir, - context_audio_paths, - "context_audio", - ) - - # Write manifest - logging.info(f"[Stage 3] Writing manifest to {manifest_path}") - written = 0 - skipped = 0 - with open(manifest_path, 'w', encoding='utf-8') as f: - for idx, a in enumerate(assignments): - if target_audio_paths[idx] is None or context_audio_paths[idx] is None: - skipped += 1 - continue - - t_cut = a["target_cut_json"] - c_cut = a["context_cut_json"] - t_sup = t_cut.get("supervisions", [{}])[0] - - text = t_sup.get("text", "") - normalized_text = t_sup.get("custom", {}).get("normalized_text", text) - ipa = t_sup.get("custom", {}).get("ipa", "") - speaker = t_sup.get("speaker", "") - duration = t_cut.get("duration", 0) - context_duration = c_cut.get("duration", 0) - ctx_lang_parsed, _, _ = parse_speaker_field(a["context_speaker"]) - - target_lang_parsed, _, _ = parse_speaker_field(speaker) - - entry = { - "audio_filepath": target_audio_paths[idx], - "text": text, - "normalized_text": 
normalized_text, - "speaker": speaker, - "language": target_lang_parsed, - "duration": duration, - "context_audio_filepath": context_audio_paths[idx], - "context_audio_duration": context_duration, - "context_speaker_similarity": round(a["ssim"], 6), - "context_language": ctx_lang_parsed, - "context_speaker": a["context_speaker"], - } - if ipa: - entry["ipa"] = ipa - - # Carry over any additional custom fields from the target supervision - _exclude_custom_keys = { - "target_audio_codes_path", - "context_audio_codes_path", - "context_audio_text", - "context_audio_normalized_text", - "context_audio_offset", - } - for k, v in t_sup.get("custom", {}).items(): - if k not in entry and k not in _exclude_custom_keys: - entry[k] = v - - f.write(json.dumps(entry, ensure_ascii=False) + "\n") - written += 1 - - logging.info(f"[Stage 3] Manifest written: {written} entries, {skipped} skipped") - return manifest_path - - -# --------------------------------------------------------------------------- -# YAML config generation (post Stage 4) -# --------------------------------------------------------------------------- - - -def generate_yaml_config(lhotse_shar_dir: str, output_yaml_path: str, data_mount_prefix: str = "/data"): - """ - Generate a lhotse YAML config pointing to the cross-lingual shar dataset. - Call this after running create_lhotse_shar_from_nemo_manifest.py on the manifest. - - Args: - lhotse_shar_dir: Absolute path to the lhotse_shar output directory - (containing cuts/, target_audio/, context_audio/). - output_yaml_path: Path to write the YAML config file. - data_mount_prefix: If shar_dir is under a mount, replace the host prefix - with this docker-internal prefix. Pass empty string to skip. 
- """ - cuts_dir = os.path.join(lhotse_shar_dir, "cuts") - target_audio_dir = os.path.join(lhotse_shar_dir, "target_audio") - context_audio_dir = os.path.join(lhotse_shar_dir, "context_audio") - - cuts_files = sorted(glob_module.glob(os.path.join(cuts_dir, "cuts.*.jsonl.gz"))) - context_files = sorted(glob_module.glob(os.path.join(context_audio_dir, "recording.*.tar"))) - - if not cuts_files: - logging.error(f"No cut files found in {cuts_dir}") - return - - # Determine shard range - first_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) - last_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[-1]).group(1)) - width = len(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) - - def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: - path = os.path.join(directory, f"{prefix}.{{{first_idx:0{width}d}..{last_idx:0{width}d}}}.{ext}") - return path - - shar_path = { - "cuts": _make_range_pattern(cuts_dir, "cuts", "jsonl.gz"), - "target_audio": _make_range_pattern(target_audio_dir, "recording", "tar"), - } - if context_files: - shar_path["context_audio"] = _make_range_pattern(context_audio_dir, "recording", "tar") - - # Check for codec codes - for codec_dir_name in os.listdir(lhotse_shar_dir): - codec_subdir = os.path.join(lhotse_shar_dir, codec_dir_name) - if not os.path.isdir(codec_subdir): - continue - target_codes_dir = os.path.join(codec_subdir, "target_codes") - context_codes_dir = os.path.join(codec_subdir, "context_codes") - if os.path.isdir(target_codes_dir): - tc_files = sorted(glob_module.glob(os.path.join(target_codes_dir, "codes.*.tar"))) - if tc_files: - tc_first = int(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) - tc_last = int(re.search(r"codes\.(\d+)\.tar$", tc_files[-1]).group(1)) - tc_width = len(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) - shar_path["target_codes"] = os.path.join( - target_codes_dir, f"codes.{{{tc_first:0{tc_width}d}..{tc_last:0{tc_width}d}}}.tar" - ) - 
if os.path.isdir(context_codes_dir): - cc_files = sorted(glob_module.glob(os.path.join(context_codes_dir, "codes.*.tar"))) - if cc_files: - cc_first = int(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) - cc_last = int(re.search(r"codes\.(\d+)\.tar$", cc_files[-1]).group(1)) - cc_width = len(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) - shar_path["context_codes"] = os.path.join( - context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" - ) - - yaml_entry = [ - { - "type": "lhotse_shar", - "shar_path": shar_path, - "weight": 1.0, - "tags": { - "task": "tts", - "lang": "crosslingual", - "tokenizer_names": ["nemotron_nano_30b"], - }, - } - ] - - os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) - with open(output_yaml_path, 'w') as f: - yaml.dump(yaml_entry, f, default_flow_style=False, sort_keys=False) - logging.info(f"YAML config written to {output_yaml_path}") - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", - ) - parser.add_argument( - "--master-yaml", - required=True, - type=str, - help="Path to the master multilingual YAML (e.g. 
train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).", - ) - parser.add_argument( - "--output-dir", - required=True, - type=str, - help="Base directory for all outputs (extracted audio, manifest, speaker index).", - ) - parser.add_argument( - "--target-hours", - type=float, - default=50.0, - help="Total hours of target audio to sample (split equally across languages).", - ) - parser.add_argument( - "--samples-per-speaker", - type=int, - default=5, - help="Number of utterances per speaker to use for computing the average TitaNet embedding.", - ) - parser.add_argument( - "--sample-rate", - type=int, - default=24000, - help="Sample rate for saving extracted audio files.", - ) - parser.add_argument( - "--embedding-batch-size", - type=int, - default=16, - help="Batch size for TitaNet embedding computation.", - ) - parser.add_argument( - "--max-shards-per-dataset", - type=int, - default=0, - help="Max number of .jsonl.gz shard files to scan per dataset during " - "speaker discovery (Stage 1). 0 means scan all shards. " - "Setting this to e.g. 10 dramatically speeds up discovery while " - "still finding most speakers.", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for reproducibility.", - ) - parser.add_argument( - "--log-level", - type=str, - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Logging level.", - ) - parser.add_argument( - "--generate-yaml", - type=str, - default=None, - help="If provided, skip stages 1-3 and instead generate a YAML config " - "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. 
" - "Value is the output YAML file path.", - ) - args = parser.parse_args() - - log_level = getattr(logging, args.log_level.upper(), logging.INFO) - logging.basicConfig( - level=log_level, - format='%(asctime)s - %(levelname)s - %(message)s', - ) - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - os.makedirs(args.output_dir, exist_ok=True) - - # --- Generate YAML config mode (post Stage 4) --- - if args.generate_yaml: - lhotse_shar_dir = os.path.join(args.output_dir, "lhotse_shar") - generate_yaml_config(lhotse_shar_dir, args.generate_yaml) - return - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - logging.info(f"Using device: {device}") - - # --- Parse master YAML --- - logging.info(f"Parsing master YAML: {args.master_yaml}") - lang_to_shar_entries = parse_master_yaml(args.master_yaml) - if not lang_to_shar_entries: - logging.error("No shar entries found. Check the master YAML path and contents.") - return - - for lang, entries in sorted(lang_to_shar_entries.items()): - logging.info(f" Language '{lang}': {len(entries)} shar groups (with context_audio)") - - # --- Stage 1: Build speaker embedding index --- - index_path = os.path.join(args.output_dir, "speaker_embedding_index.pkl") - speaker_embeddings = run_stage1( - lang_to_shar_entries, - samples_per_speaker=args.samples_per_speaker, - device=device, - index_path=index_path, - batch_size=args.embedding_batch_size, - max_shards_per_dataset=args.max_shards_per_dataset, - ) - - # --- Stage 2: Cross-lingual matching + balanced sampling --- - cross_lingual_map = build_crosslingual_map(speaker_embeddings) - target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts( - lang_to_shar_entries, - cross_lingual_map, - target_hours=args.target_hours, - seed=args.seed, - max_shards_per_dataset=args.max_shards_per_dataset, - ) - - # --- Stage 3: Extract audio + write manifest --- - manifest_path = run_stage3( - target_cuts_by_lang, - 
context_pool_by_speaker, - cross_lingual_map, - speaker_embeddings, - args.output_dir, - args.sample_rate, - args.seed, - ) - - # --- Summary --- - logging.info("=" * 60) - logging.info("Cross-lingual context dataset creation complete!") - logging.info(f" Manifest: {manifest_path}") - logging.info(f" Audio dir: {os.path.join(args.output_dir, 'extracted_audio')}") - logging.info("") - logging.info("Next steps:") - logging.info(" 1. Convert to lhotse shar format:") - logging.info(f" python scripts/magpietts/create_lhotse_shar_from_nemo_manifest.py \\") - logging.info(f" --manifest-path {manifest_path} \\") - logging.info(f" --audio-base-dir {os.path.join(args.output_dir, 'extracted_audio')} \\") - logging.info(f" --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\") - logging.info(f" --num-jobs 16 --processing-chunk-size 256 --audio-format flac --shuffle --shuffle-seed 42") - logging.info("") - logging.info(" 2. (Optional) Add codec codes:") - logging.info(f" python scripts/magpietts/extend_lhotse_shards_with_audio_codes.py \\") - logging.info(f" --cuts-dir {os.path.join(args.output_dir, 'lhotse_shar', 'cuts')} \\") - logging.info(f" --target-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'target_audio')} \\") - logging.info(f" --context-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'context_audio')} \\") - logging.info(f" --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\") - logging.info(f" --codec-model-path ") - logging.info("") - yaml_out = os.path.join(args.output_dir, "crosslingual_context.yaml") - logging.info(" 3. 
Generate YAML config for training:") - logging.info(f" python scripts/magpietts/create_crosslingual_context_dataset.py \\") - logging.info(f" --master-yaml {args.master_yaml} \\") - logging.info(f" --output-dir {args.output_dir} \\") - logging.info(f" --generate-yaml {yaml_out}") - logging.info("=" * 60) - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/inspect_crosslingual_dataset.py b/scripts/magpietts/inspect_crosslingual_dataset.py deleted file mode 100644 index 6fed5c93adf5..000000000000 --- a/scripts/magpietts/inspect_crosslingual_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -Inspect the cross-lingual context dataset by decoding target and context -audio codes back to waveforms and saving them alongside the original -recording audio for comparison. - -Usage (inside docker): - python scripts/magpietts/inspect_crosslingual_dataset.py \ - --shar-dir /data/crosslingual_context_dataset/lhotse_shar \ - --codec-model-path /model_artifacts/25fps_spectral_codec_with_bandwidth_extension.nemo \ - --codec-name 25fpsSpectralCodecBWE \ - --output-dir /data/crosslingual_context_dataset/inspect \ - --num-samples 10 -""" - -import argparse -import logging -import os - -import numpy as np -import soundfile as sf -import torch -from lhotse import CutSet - -from nemo.collections.tts.models import AudioCodecModel - - -def main(): - parser = argparse.ArgumentParser(description="Inspect cross-lingual dataset: decode codes and save audio.") - parser.add_argument("--shar-dir", required=True, help="Path to lhotse_shar directory.") - parser.add_argument("--codec-model-path", required=True, help="Path to .nemo codec model.") - parser.add_argument("--codec-name", default="25fpsSpectralCodecBWE", help="Codec subdirectory name.") - parser.add_argument("--output-dir", required=True, help="Directory to save inspection outputs.") - parser.add_argument("--num-samples", type=int, default=10, help="Number of samples to inspect.") - args = parser.parse_args() - - 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") - - os.makedirs(args.output_dir, exist_ok=True) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Load codec model - logging.info(f"Loading codec model from {args.codec_model_path}") - codec_model = AudioCodecModel.restore_from(args.codec_model_path, map_location="cpu", strict=False) - codec_model = codec_model.to(device) - codec_model.eval() - codec_sr = codec_model.sample_rate - logging.info(f"Codec output sample rate: {codec_sr}") - - # Build shar fields for first shard - cuts_dir = os.path.join(args.shar_dir, "cuts") - target_audio_dir = os.path.join(args.shar_dir, "target_audio") - context_audio_dir = os.path.join(args.shar_dir, "context_audio") - target_codes_dir = os.path.join(args.shar_dir, args.codec_name, "target_codes") - context_codes_dir = os.path.join(args.shar_dir, args.codec_name, "context_codes") - - # Use first shard only - fields = { - "cuts": [os.path.join(cuts_dir, "cuts.000000.jsonl.gz")], - "recording": [os.path.join(target_audio_dir, "recording.000000.tar")], - "context_recording": [os.path.join(context_audio_dir, "recording.000000.tar")], - "target_codes": [os.path.join(target_codes_dir, "codes.000000.tar")], - "context_codes": [os.path.join(context_codes_dir, "codes.000000.tar")], - } - - for k, v in fields.items(): - if not os.path.isfile(v[0]): - logging.error(f"Missing file for '{k}': {v[0]}") - return - - logging.info("Loading CutSet from shar...") - cutset = CutSet.from_shar(fields=fields) - - count = 0 - for cut in cutset: - if count >= args.num_samples: - break - - sup = cut.supervisions[0] if cut.supervisions else None - lang = sup.language if sup else "unk" - speaker = sup.speaker if sup else "unk" - ctx_lang = sup.custom.get("context_language", "unk") if sup and hasattr(sup, "custom") else "unk" - ssim = sup.custom.get("context_speaker_similarity", "N/A") if sup and hasattr(sup, "custom") else "N/A" - - 
sample_dir = os.path.join(args.output_dir, f"sample_{count:03d}_{lang}") - os.makedirs(sample_dir, exist_ok=True) - - logging.info(f"--- Sample {count} ---") - logging.info(f" Cut ID: {cut.id}") - logging.info(f" Target lang: {lang}, Context lang: {ctx_lang}, SSIM: {ssim}") - logging.info(f" Speaker: {speaker}") - if sup: - logging.info(f" Text: {sup.text[:80]}...") - - # 1. Save original target recording audio - target_audio_np = cut.recording.resample(codec_sr).load_audio().squeeze(0) - sf.write(os.path.join(sample_dir, "target_recording.wav"), target_audio_np, codec_sr) - logging.info(f" Saved target_recording.wav ({len(target_audio_np)/codec_sr:.2f}s)") - - # 2. Save original context recording audio - if cut.has_custom("context_recording"): - ctx_audio_np = cut.context_recording.resample(codec_sr).load_audio().squeeze(0) - sf.write(os.path.join(sample_dir, "context_recording.wav"), ctx_audio_np, codec_sr) - logging.info(f" Saved context_recording.wav ({len(ctx_audio_np)/codec_sr:.2f}s)") - - # 3. Decode target codes -> audio - if cut.has_custom("target_codes"): - target_codes_np = cut.target_codes.load().astype(np.int32) # (C, T) - target_codes_t = torch.from_numpy(target_codes_np).unsqueeze(0).to(device) # (1, C, T) - target_codes_len = torch.tensor([target_codes_t.shape[2]], device=device) - with torch.inference_mode(): - decoded_target, decoded_target_len = codec_model.decode( - tokens=target_codes_t, tokens_len=target_codes_len - ) - decoded_target_np = decoded_target[0, : decoded_target_len[0]].cpu().float().numpy() - sf.write( - os.path.join(sample_dir, "target_decoded_from_codes.wav"), - decoded_target_np, - codec_model.output_sample_rate, - ) - logging.info( - f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}" - ) - else: - logging.warning(f" No target_codes found for cut {cut.id}") - - # 4. 
Decode context codes -> audio - if cut.has_custom("context_codes"): - ctx_codes_np = cut.context_codes.load().astype(np.int32) # (C, T) - ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device) # (1, C, T) - ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device) - with torch.inference_mode(): - decoded_ctx, decoded_ctx_len = codec_model.decode(tokens=ctx_codes_t, tokens_len=ctx_codes_len) - decoded_ctx_np = decoded_ctx[0, : decoded_ctx_len[0]].cpu().float().numpy() - sf.write( - os.path.join(sample_dir, "context_decoded_from_codes.wav"), - decoded_ctx_np, - codec_model.output_sample_rate, - ) - logging.info( - f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}" - ) - else: - logging.warning(f" No context_codes found for cut {cut.id}") - - # 5. Write metadata - with open(os.path.join(sample_dir, "info.txt"), "w") as f: - f.write(f"cut_id: {cut.id}\n") - f.write(f"target_language: {lang}\n") - f.write(f"context_language: {ctx_lang}\n") - f.write(f"speaker: {speaker}\n") - f.write(f"context_speaker_similarity: {ssim}\n") - f.write(f"text: {sup.text if sup else ''}\n") - f.write(f"duration: {cut.duration}\n") - - count += 1 - - logging.info(f"Done. 
Saved {count} samples to {args.output_dir}") - - -if __name__ == "__main__": - main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json deleted file mode 100644 index 6d7e35116405..000000000000 --- a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json +++ /dev/null @@ -1,9954 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 0, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 1, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - } - ], - "normalizer": null, - "pre_tokenizer": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": true - }, - "post_processor": null, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": "", - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": false, - "vocab": { - "": 0, - "": 1, - "": 2, - "(": 3, - ")": 4, - "-": 5, - ".": 6, - "1": 7, - "2": 8, - "4": 9, - "5": 10, - "6": 11, - "7": 12, - "F": 13, - "a": 14, - "b": 15, - "c": 16, - "d": 17, - "e": 18, - "f": 19, - "h": 20, - "i": 21, - "j": 22, - "k": 23, - "l": 24, - "m": 25, - "n": 26, - "o": 27, - "p": 28, - "q": 29, - "r": 30, - "s": 31, - "t": 32, - "u": 33, - "v": 34, - "w": 35, - "x": 36, - "y": 37, - "z": 38, - "¡": 39, - "£": 40, - "¦": 41, - "§": 42, - "©": 43, - "ª": 44, - "¬": 45, - "°": 46, - "²": 47, - "³": 48, - "¸": 49, - "¹": 50, - 
"¾": 51, - "Ã": 52, - "Å": 53, - "É": 54, - "Ê": 55, - "Ë": 56, - "Ì": 57, - "Î": 58, - "Ï": 59, - "Ċ": 60, - "Ġ": 61, - "Ģ": 62, - "ģ": 63, - "Ĥ": 64, - "ĥ": 65, - "ĩ": 66, - "Ī": 67, - "Ĭ": 68, - "ĭ": 69, - "Į": 70, - "į": 71, - "İ": 72, - "ı": 73, - "IJ": 74, - "ij": 75, - "Ĵ": 76, - "ĵ": 77, - "Ķ": 78, - "ķ": 79, - "ĸ": 80, - "Ĺ": 81, - "Ļ": 82, - "Ľ": 83, - "ľ": 84, - "Ŀ": 85, - "Ł": 86, - "ËĪ": 87, - "ËIJ": 88, - "ËĪÉ": 89, - "ËĮ": 90, - "ÉĻ": 91, - "ËĪa": 92, - "ËĪi": 93, - "Ġt": 94, - "ɪ": 95, - "ɾ": 96, - "ĠÉ": 97, - "Ġk": 98, - "Éľ": 99, - "Ġs": 100, - "ËĪe": 101, - "ÉĽ": 102, - "ËĪo": 103, - "Ġl": 104, - "ËĪÉĽ": 105, - "Ġd": 106, - "ÊĬ": 107, - "ËĪaËIJ": 108, - "Ġp": 109, - "Ìĥ": 110, - "Ġm": 111, - "ËĪu": 112, - "Åĭ": 113, - "ð": 114, - "ËĪÉĶ": 115, - "ÊĮ": 116, - "ËĮa": 117, - "Ġh": 118, - "ËĪÊĮ": 119, - "Ġn": 120, - "Êģ": 121, - "ËĪÉij": 122, - "Êĥ": 123, - "eËIJ": 124, - "Ġa": 125, - "Ġb": 126, - "ÉĶ": 127, - "ËĪÉĻ": 128, - "ÉĻn": 129, - "Ġf": 130, - "ËĪɪ": 131, - "É¡": 132, - "ËĪeËIJ": 133, - "Ġj": 134, - "nt": 135, - "Ġð": 136, - "ĠËĮ": 137, - "Ġts": 138, - "ĠÉ¡": 139, - "Éķ": 140, - "ËĪoËIJ": 141, - "ʰ": 142, - "aËIJ": 143, - "ËĪy": 144, - "ĠtÉķ": 145, - "ËĪiËIJ": 146, - "ĠÊ": 147, - "Ġv": 148, - "Ġw": 149, - "st": 150, - "Éij": 151, - "nd": 152, - "ËĮi": 153, - "̪": 154, - "ËĮe": 155, - "Ġz": 156, - "ËĪaɪ": 157, - "ËĪiÉĽ": 158, - "β": 159, - "ɹ": 160, - "ĠËĮa": 161, - "θ": 162, - "ĠhÉĽ": 163, - "ÊĪ": 164, - "iËIJ": 165, - "ËĮo": 166, - "Ġɪ": 167, - "Éľn": 168, - "Ġx": 169, - "ĠtÉĻ": 170, - "ËĪuËIJ": 171, - "ËĮÉĻ": 172, - "ĠjËĪi": 173, - "ËĮÉĽ": 174, - "ĠÉĽ": 175, - "ĠËĪa": 176, - "ËĮaËIJ": 177, - "Ġla": 178, - "Ġðe": 179, - "ĠhÉĽËIJ": 180, - "Ġe": 181, - "ç": 182, - "ÉĻl": 183, - "oËIJ": 184, - "ËĪÉiju": 185, - "ÊĴ": 186, - "uËIJ": 187, - "ĠÉĹ": 188, - "ĠÉķ": 189, - "ËĮeËIJ": 190, - "ĠtÉķËĪi": 191, - "os": 192, - "ËĪÉĶËIJ": 193, - "as": 194, - "ËĪÊĬ": 195, - "Ġi": 196, - "ËĪai": 197, - "ɲ": 198, - "ɪn": 199, - "ts": 200, - "ÉľÅĭ": 201, - "ĠÉŁ": 
202, - "ĠÊĥ": 203, - "ËĪeɪ": 204, - "ÉĽÉ¾": 205, - "ËĪÉĽËIJ": 206, - "ËĪÉĽÉ¾": 207, - "Ġr": 208, - "tÊĥ": 209, - "ËĮÉĶ": 210, - "ĠdÉĻ": 211, - "tÉĻ": 212, - "ou": 213, - "ËĪyÉĻ": 214, - "ĠËĮi": 215, - "ÉĻɾ": 216, - "ËĪÉĻÊĬ": 217, - "ËĪÊĮɾ": 218, - "ËĪÉĴ": 219, - "Ġth": 220, - "ËĪon": 221, - "Êĭ": 222, - "ËĪÉijËIJ": 223, - "ËĪÊĮh": 224, - "wËĪa": 225, - "ËĪei": 226, - "ll": 227, - "ĠÉIJ": 228, - "ÉijËIJ": 229, - "an": 230, - "ÉŁ": 231, - "ĠÊĭ": 232, - "Ġko": 233, - "kh": 234, - "ɪÅĭ": 235, - "ËĪaËIJɪ": 236, - "ĠtÊĥ": 237, - "ËĪaËIJt": 238, - "ĠËĮe": 239, - "ĠtÉķh": 240, - "ËĪuo": 241, - "ËĪonÉ¡": 242, - "Éĸ": 243, - "at": 244, - "Ġke": 245, - "ÉĴ": 246, - "ĠÉķËĪi": 247, - "ø": 248, - "ĠÉij": 249, - "ËĪeËIJk": 250, - "Åĵ": 251, - "re": 252, - "Ġɾ": 253, - "ĠkÉĶ": 254, - "ËĮÊĬ": 255, - "sk": 256, - "ĠÊĬ": 257, - "Ġand": 258, - "ɪç": 259, - "Ġme": 260, - "ËĪaɾ": 261, - "ĠËĪɪ": 262, - "na": 263, - "Ġβ": 264, - "ĠlËĪi": 265, - "jaËIJ": 266, - "li": 267, - "no": 268, - "Ġɪn": 269, - "ĠdËĮi": 270, - "Ġɲ": 271, - "tËIJ": 272, - "ÉĻm": 273, - "ĠlÉĻ": 274, - "ĠðÉĻ": 275, - "ɪk": 276, - "ËĪÉĽl": 277, - "Éľt": 278, - "Ġse": 279, - "es": 280, - "ËĪou": 281, - "ËĪaÊĬ": 282, - "ĠÉĶ": 283, - "ɪt": 284, - "ĠÅĭ": 285, - "ËĪÉĽn": 286, - "Êİ": 287, - "Ġkh": 288, - "ËĪÉĽnt": 289, - "ËĪaËIJɾ": 290, - "Ġki": 291, - "mp": 292, - "lt": 293, - "É£": 294, - "Ġpa": 295, - "ËĪÉĻËIJ": 296, - "ɪs": 297, - "ĠÉĴ": 298, - "Ġle": 299, - "ÉªÉľ": 300, - "ËĪÉĽt": 301, - "Ġde": 302, - "Ġɹ": 303, - "ĠtËĪoËIJ": 304, - "ĠÊģ": 305, - "ÊĥÉĻn": 306, - "ĠÊĬnt": 307, - "ËĪÉĶɾ": 308, - "ËĪað": 309, - "Ġaɪ": 310, - "ĠÊIJ": 311, - "ĠmËĪa": 312, - "ra": 313, - "ĠkËĪɪ": 314, - "kt": 315, - "ËIJp": 316, - "ĠÊĪ": 317, - "ËĪaËIJÊĬ": 318, - "ĠkËĪÊĮɾ": 319, - "ĠËĪÊĮ": 320, - "ĠÉĴv": 321, - "Ġel": 322, - "ks": 323, - "Ġkw": 324, - "ÉĻt": 325, - "ndo": 326, - "ei": 327, - "ĠËĮaËIJp": 328, - "se": 329, - "ÉĻɹ": 330, - "ËĪuei": 331, - "ÉĻs": 332, - "ĠkËĮo": 333, - "ĠÊĤ": 334, - "ĠËĮÊĬ": 335, - "Ġc": 336, - "ĠÉĽn": 337, - 
"ËĪant": 338, - "θj": 339, - "ËĮoËIJ": 340, - "ĠËĪaËIJ": 341, - "Ġpɾ": 342, - "si": 343, - "ĠËĪe": 344, - "ĠjuËIJ": 345, - "ĠkËĮe": 346, - "ËĮɪ": 347, - "ÉĶn": 348, - "ĠsËĪÊĮ": 349, - "ĠËĪu": 350, - "ni": 351, - "Ġst": 352, - "ĠdiËIJ": 353, - "ĠkeËIJ": 354, - "ĠjËĪiou": 355, - "ËĪaiÉľ": 356, - "ĠdÊĴ": 357, - "ĠËĪÉĶ": 358, - "va": 359, - "ËIJɾ": 360, - "ËĪø": 361, - "ËĮÉĻÊĬ": 362, - "ĠpËĪu": 363, - "Ġsu": 364, - "Ġma": 365, - "ĠÉĻ": 366, - "dÊĴ": 367, - "Ġpʰ": 368, - "le": 369, - "in": 370, - "ĠtÉķhËĪi": 371, - "ĠwËĪo": 372, - "ro": 373, - "ËĮy": 374, - "ɾa": 375, - "ĠsËĪi": 376, - "ðÉĻ": 377, - "ĠseËIJ": 378, - "la": 379, - "ĠÊĴ": 380, - "mb": 381, - "ĠhËĪoËIJ": 382, - "Ġbʰ": 383, - "ĠÉĽÉ¾": 384, - "Ġðat": 385, - "sp": 386, - "ÉĶɾ": 387, - "en": 388, - "ĠsÉĻ": 389, - "ËĪÉĶÉľ": 390, - "ĠlËĮa": 391, - "ĠËĮÉĽ": 392, - "ĠËĪy": 393, - "É¡aËIJ": 394, - "ĠdÉĽÉ¾": 395, - "ËĪÉĽÊģ": 396, - "Éľkh": 397, - "ËĪiÉĻ": 398, - "ËĪan": 399, - "ĠmËĪo": 400, - "ËĪaβ": 401, - "Ġal": 402, - "ĠËĪeËIJ": 403, - "Ġθ": 404, - "ĠnËĪi": 405, - "pʰ": 406, - "lla": 407, - "Ġpl": 408, - "ËĪÅĵ": 409, - "jËĪÉiju": 410, - "Ġav": 411, - "ĠmËĪi": 412, - "ĠfËĪa": 413, - "ËĪÉľ": 414, - "me": 415, - "ËĮÉĻh": 416, - "ËĪuÉĻ": 417, - "it": 418, - "jËĪe": 419, - "Ġo": 420, - "ËĪÉľËIJ": 421, - "ĠtÉķËĪiou": 422, - "ÉĶËIJ": 423, - "ĠnÉĻ": 424, - "ËĪÉĻÉľn": 425, - "ĠmÉĻ": 426, - "ĠdeËIJ": 427, - "mo": 428, - "sa": 429, - "jËĪÉĶ": 430, - "ËĪal": 431, - "ĠtÉķËĪiÉĽ": 432, - "ĠÉ¡ÉĻ": 433, - "ða": 434, - "Ġɪz": 435, - "Ġsa": 436, - "ri": 437, - "ĠËĮil": 438, - "ËĮu": 439, - "ĠkaËIJ": 440, - "ĠÉĻËIJ": 441, - "ĠÉĸ": 442, - "Ġka": 443, - "ËĪÊĮhi": 444, - "ĠjeËIJ": 445, - "Ġtʰ": 446, - "ne": 447, - "kËIJ": 448, - "ĠtsËĪai": 449, - "ĠËĪeËIJk": 450, - "nk": 451, - "ti": 452, - "ËĪaÉľn": 453, - "ĠkËIJ": 454, - "É¡ÉĻn": 455, - "ËĪia": 456, - "ĠÉĶËIJɾ": 457, - "Êı": 458, - "ĠËĮÊĮ": 459, - "ĠzËĪaËIJ": 460, - "Ġlos": 461, - "ÉĽs": 462, - "ËĪÉĶn": 463, - "ÉĽnt": 464, - "ÉĽn": 465, - "ĠÉŁËĪoËIJ": 466, - "çt": 467, - "Ġdas": 468, 
- "ĠxËĮo": 469, - "ËĪuÉľ": 470, - "ËĪas": 471, - "ĠbËĪÊĮ": 472, - "ËĪiÉĽÉľn": 473, - "ÉIJ": 474, - "ĠtsuËIJ": 475, - "ĠpËĮÉĽ": 476, - "ĠnËĪÉĶ": 477, - "ÊĬt": 478, - "ma": 479, - "ĠnËĪo": 480, - "ĠlËĪɪ": 481, - "ËĪÉĽs": 482, - "ɪl": 483, - "ĠÉķËĪiÉĽ": 484, - "ĠËĪÊĬ": 485, - "ÉĴt": 486, - "to": 487, - "ĠËĪo": 488, - "ËĮon": 489, - "ĠkwËĪa": 490, - "Ġɪt": 491, - "ĠhoËIJ": 492, - "ËĪiËIJk": 493, - "ĠËĮaËIJpk": 494, - "ËĪaɪn": 495, - "æ": 496, - "ÉĻnt": 497, - "ta": 498, - "lo": 499, - "ĠnËĪÉij": 500, - "ĠlËĪa": 501, - "ËĪiÉľ": 502, - "ĠwËĪei": 503, - "ÉĽÊģ": 504, - "ĠtËĪa": 505, - "ĠɾËĮÉĻh": 506, - "ĠÉķËĪiÉij": 507, - "ËĮiËIJ": 508, - "ËĮÉĽl": 509, - "ĠtÉĻÉľ": 510, - "ĠkËĪuo": 511, - "ĠtËĪu": 512, - "jËĪÉĽ": 513, - "ĠËĮin": 514, - "ɾe": 515, - "ĠkoËIJ": 516, - "ĠkËĪa": 517, - "ɾi": 518, - "ĠtÉķËĪiÉij": 519, - "lÉĻ": 520, - "ĠkÉĻ": 521, - "ĠtËĪi": 522, - "ĠÅĭËĪyÉĻ": 523, - "Ġtsh": 524, - "er": 525, - "av": 526, - "ĠkÉĶn": 527, - "ËĪÉĻÉľÅĭ": 528, - "ðo": 529, - "ËĪaËIJn": 530, - "ĠbʰËĪi": 531, - "ĠkËIJjaËIJ": 532, - "ÉĻz": 533, - "ĠpÊģ": 534, - "ĠdËĪɪ": 535, - "ĠziËIJ": 536, - "É¡eËIJ": 537, - "ĠtËĪÉĻ": 538, - "ɪz": 539, - "ĠnËĮon": 540, - "taËIJ": 541, - "bl": 542, - "te": 543, - "nËĮeËIJ": 544, - "ËĪɪl": 545, - "so": 546, - "ko": 547, - "uÊģ": 548, - "ĠÉ£": 549, - "ĠpaÊģ": 550, - "ĠËĪÉĽ": 551, - "jËĪuËIJ": 552, - "ËĮÊĮ": 553, - "yn": 554, - "ËĪiËIJn": 555, - "ĠlËĪaɪ": 556, - "ËĪɪÅĭ": 557, - "ĠtÉķhËĪy": 558, - "ĠnËĪÊĮhi": 559, - "ĠdËĮe": 560, - "ĠjËĪÉiju": 561, - "ĠtËĪÉiju": 562, - "ĠhËĪo": 563, - "ɪd": 564, - "ĠthËĪÉij": 565, - "mËĪe": 566, - "ĠËĪÉĻ": 567, - "ja": 568, - "Ġph": 569, - "ÉĽt": 570, - "ĠkËĪÊĮ": 571, - "tÉĻn": 572, - "mËĪÉij": 573, - "wËĪe": 574, - "ĠËĮaɪn": 575, - "Ġðɪs": 576, - "É¡ÉĻ": 577, - "ĠnËĪaËIJ": 578, - "ĠbËĪaËIJ": 579, - "Ġaθ": 580, - "ĠmËĮa": 581, - "ËĪÊĮha": 582, - "ĠdËĮa": 583, - "ËĪÊı": 584, - "ĠɲËĮy": 585, - "ĠpËĪa": 586, - "ËĪaðo": 587, - "di": 588, - "bÉľ": 589, - "ɳ": 590, - "ĠwiËIJ": 591, - "ĠnËĪɪ": 592, - "ĠÉ¡ËĪÉĶÉľ": 593, - "tËIJo": 
594, - "ËĮÉĻm": 595, - "ËĪaËIJr": 596, - "ĠmÉĽ": 597, - "ËĪeËIJÉ¡aËIJ": 598, - "ĠsËĮi": 599, - "ĠlËĮaËIJ": 600, - "nËĮaËIJ": 601, - "Ġsp": 602, - "tÊģ": 603, - "ĠÊİ": 604, - "ËĮÉijËIJ": 605, - "Ġkl": 606, - "kʰ": 607, - "il": 608, - "ĠÊĥt": 609, - "ĠËĮÊĬn": 610, - "al": 611, - "ĠsËĪÉĽ": 612, - "ĠmËĪaËIJ": 613, - "ĠÅĵ": 614, - "ĠÉ¡ËĪÊĮ": 615, - "ĠpËĮÉĽr": 616, - "ɾËĪa": 617, - "ËIJÊĪ": 618, - "ËĪaβa": 619, - "ĠwËĪÉĴ": 620, - "ĠxËĪuei": 621, - "ĠkhËĪo": 622, - "Ġlas": 623, - "ĠÉĹËĪo": 624, - "ĠfÉĽÉ¾": 625, - "ĠjËĪiÉĽ": 626, - "ĠtËĪe": 627, - "ĠkËĮÉĶ": 628, - "ĠdeËIJn": 629, - "Ġmo": 630, - "ĠpËĪi": 631, - "ĠtËĪÉij": 632, - "ËĪÉĽst": 633, - "wËĪÉij": 634, - "ËĪaɪt": 635, - "ÉĻÊĬ": 636, - "ĠËĪi": 637, - "ɪj": 638, - "aɪ": 639, - "ËĪaËIJÉľ": 640, - "ĠËĪɪs": 641, - "ĠpÉĶɾ": 642, - "Ã¦Éľn": 643, - "ka": 644, - "ÅĭÉ¡": 645, - "bÉĻn": 646, - "ÊĬf": 647, - "Ġpɹ": 648, - "ĠlËĮe": 649, - "ËĪiËIJd": 650, - "ËĪaËIJre": 651, - "ĠmËĪÊĮ": 652, - "ÉĻr": 653, - "ĠdÉij": 654, - "ËĪaËIJto": 655, - "ĠpËĪeËIJ": 656, - "ĠdËĪoËIJ": 657, - "ĠsËĮÊĬ": 658, - "ĠhËĪi": 659, - "ĠsËĪa": 660, - "ËĪeËIJn": 661, - "dÉĻ": 662, - "Ġpj": 663, - "ËĪÅĵÊģ": 664, - "lɪç": 665, - "ÉĴn": 666, - "ĠËĪÉĻr": 667, - "tËĪe": 668, - "Ġil": 669, - "ËĪaËIJl": 670, - "ĠsËĮÉĻÊĬ": 671, - "sÊĪ": 672, - "ĠdËĪuËIJ": 673, - "hËĪÉij": 674, - "ĠxËĪou": 675, - "ĠlËĪaiÉľ": 676, - "wËĪo": 677, - "ËĪÉĽnte": 678, - "Ġsy": 679, - "Ġzɪç": 680, - "ĠÉ¡ËĪu": 681, - "ĠÉķËĪy": 682, - "ËĪÉĶËIJl": 683, - "ÉĶl": 684, - "ĠtËĪo": 685, - "ĠÊĭoËIJ": 686, - "ĠiËIJ": 687, - "wËĪaða": 688, - "ËĪando": 689, - "Ġaθɼnt": 690, - "ĠaθɼntwËĪaða": 691, - "ĠtËĪiÉĽ": 692, - "ËĪeiÉľ": 693, - "ĠpËĮa": 694, - "ĠnËĪaɪ": 695, - "wa": 696, - "Ġfr": 697, - "ĠÊIJËĪÉĻÉľn": 698, - "ËĪua": 699, - "mi": 700, - "ĠmËĪÉĽ": 701, - "ËĪeËIJkʰ": 702, - "cʰ": 703, - "ĠwËĪÉij": 704, - "sta": 705, - "Ġtu": 706, - "Ġsk": 707, - "ËĪÉĶl": 708, - "ËĪeËIJÊĪ": 709, - "ĠlËĪaËIJɪ": 710, - "ĠlËĪaËIJ": 711, - "ËĪÉĽËIJs": 712, - "ËĪÉĽÉ¾a": 713, - "ËĪÉĻÉľt": 714, - "Ġyn": 715, - "dÉĻn": 
716, - "Ġdi": 717, - "ËĪiËIJs": 718, - "Ġðel": 719, - "ËĪÊĮr": 720, - "ĠhËĪaËIJ": 721, - "ĠbÉĻ": 722, - "ĠjËĪuËIJ": 723, - "lle": 724, - "sto": 725, - "ËĪɪt": 726, - "ËĪoËIJɾ": 727, - "bʰ": 728, - "mÉĻn": 729, - "ËĮuÉĻ": 730, - "ËĮÉĻɾ": 731, - "ËĪÊĮn": 732, - "ĠlËĪaɪk": 733, - "ĠbËĪa": 734, - "ɪð": 735, - "Ġlo": 736, - "zi": 737, - "ËĪÊĮst": 738, - "mËĪi": 739, - "ÉĶÊģ": 740, - "ĠnËĪɪçt": 741, - "Ġtɾ": 742, - "ĠdËĪeËIJkʰ": 743, - "ĠsËĮe": 744, - "ĠnËĪÉĻÊĬ": 745, - "Ġu": 746, - "Ġsi": 747, - "Ġɪç": 748, - "Ġpr": 749, - "ĠtÉķËĪy": 750, - "ĠmËĪu": 751, - "za": 752, - "ĠtÊģ": 753, - "Ġwɪð": 754, - "tËĪÉĽ": 755, - "ĠpËĪÊĮɾ": 756, - "ĠkËĪÉĶ": 757, - "ËĪoËIJr": 758, - "ĠhËĮa": 759, - "ĠkËĪonÉ¡": 760, - "ĠpuÊģ": 761, - "Ġdy": 762, - "ËĪɪn": 763, - "nte": 764, - "ĠkËĮa": 765, - "ËĪÉĻɪ": 766, - "Ġmi": 767, - "ĠÉ¡ËĮuÉĻ": 768, - "Ġʲ": 769, - "ĠfËĪÉij": 770, - "ĠvÉijËIJ": 771, - "ĠËĮaÊĬ": 772, - "ËĮuËIJ": 773, - "ĠËĪun": 774, - "ĠjËĪÊĮha": 775, - "juËIJ": 776, - "Ġmɪt": 777, - "ĠlËĪÉĽ": 778, - "ËĪeËIJÊĥ": 779, - "ĠfÉĶËIJ": 780, - "mÉĻ": 781, - "ɾt": 782, - "ĠkËĮon": 783, - "ĠlËĪÉĶ": 784, - "ĠxËĪÉiju": 785, - "pl": 786, - "ĠdËĪi": 787, - "ĠlËĪoËIJ": 788, - "sÉĻ": 789, - "ËĪaËIJva": 790, - "ĠlËĪu": 791, - "ĠÉ¡ËĮÉĻÊĬ": 792, - "Ġhav": 793, - "ĠËĮaËIJpkËĮoËIJ": 794, - "ɾËĪi": 795, - "ĠfËĪÉĻ": 796, - "ĠhËĮÉĻm": 797, - "ËĪonÉ¡Éľ": 798, - "jo": 799, - "ĠsÉĶ": 800, - "ËĪaËIJd": 801, - "wËĪiÉĻ": 802, - "ËĪand": 803, - "ËĮaɪn": 804, - "tɾ": 805, - "ĠËĮɪ": 806, - "ĠËĪuna": 807, - "ĠxwËĪÉij": 808, - "ĠjÉĶËIJ": 809, - "ÊģËĪi": 810, - "ĠkËĪuoÉľ": 811, - "Ġaβ": 812, - "ĠÉ¡ËĪaËIJ": 813, - "ano": 814, - "tÉĻl": 815, - "ĠrËĮe": 816, - "ËĮÊĮt": 817, - "ĠjËĪiÉij": 818, - "ĠɾËĮÉĻhaËIJ": 819, - "ĠmËĪe": 820, - "ĠËĪyÃ¦Éľn": 821, - "ĠfËĪu": 822, - "Ġbl": 823, - "nËĪi": 824, - "sÉĻn": 825, - "Ġaɪn": 826, - "ËĪiÊĬ": 827, - "Ġðeɪ": 828, - "Ġɪts": 829, - "Ġ(": 830, - "ËĪyËIJ": 831, - "ÉĻd": 832, - "ĠËĮo": 833, - "ĠÉĽs": 834, - "ĠviËIJ": 835, - "ËIJÉ¡eËIJ": 836, - "kËĪe": 837, - "ĠËĪal": 838, - "ÉĽl": 839, 
- "ĠÊĮ": 840, - "ËIJo": 841, - "ĠkËĪo": 842, - "ĠÊĪËĪuËIJ": 843, - "ĠsËĪɪ": 844, - "ËĪeËIJɾ": 845, - "Éľm": 846, - "ËĮÉĻn": 847, - "ËĪaËIJi": 848, - "ËĪoËIJl": 849, - "ɪËĮeËIJ": 850, - "ĠʲËĪy": 851, - "ĠkËĪÉĶËIJ": 852, - "sËĪi": 853, - "ĠlËĪe": 854, - "ËĮÉĴt": 855, - "ËĪiËIJp": 856, - "aÊģ": 857, - "ĠθËĪɪÅĭ": 858, - "ËĪÉĻËIJɪ": 859, - "ËĪÊĮl": 860, - "ĠhËĪoËIJtaËIJ": 861, - "ËĪoɪ": 862, - "nto": 863, - "zh": 864, - "ĠdeËIJm": 865, - "ĠkÉĶm": 866, - "ʰËĪiËIJk": 867, - "ĠdÊĴËĪÊĮst": 868, - "pɾ": 869, - "Ġly": 870, - "hËĪu": 871, - "ËĪÉĶø": 872, - "ËĪaËIJs": 873, - "ĠËĪan": 874, - "ĠËĪÉĴ": 875, - "Ġkan": 876, - "ĠtsËĪuo": 877, - "ËĪeËIJva": 878, - "Ġɡɾ": 879, - "Ġpo": 880, - "ĠtÊĥËĪÉĶ": 881, - "Êİa": 882, - "ĠmËĮi": 883, - "Êĥt": 884, - "tËĪi": 885, - "ĠhËĪÊĮ": 886, - "tÊĥe": 887, - "ĠfÉĶn": 888, - "ve": 889, - "ĠnËĮe": 890, - "ËĪÉĶÊģ": 891, - "iz": 892, - "ĠsËĪuo": 893, - "ËĪÉĽËIJr": 894, - "wËĪaÊģ": 895, - "ËĪaða": 896, - "Åĭk": 897, - "po": 898, - "ĠkËĪi": 899, - "ËĪad": 900, - "ĠvËĪi": 901, - "tÉķ": 902, - "ĠkËĪÉĻ": 903, - "ĠwËĪu": 904, - "ÉĴz": 905, - "ĠvÉijËIJɾ": 906, - "ÊģËĪÉĽ": 907, - "ĠkËĪaËIJ": 908, - "ke": 909, - "nÉĻ": 910, - "ËĪÊĮb": 911, - "ËĪuËIJɾ": 912, - "ËĮÉĻËIJ": 913, - "ĠÊĪʰËĪiËIJk": 914, - "ĠkËĪu": 915, - "ĠbËĮÊĮt": 916, - "Ġat": 917, - "Ġfɹ": 918, - "ËĪax": 919, - "ĠzoËIJ": 920, - "ĠtËĪaËIJ": 921, - "ĠðËĮe": 922, - "neËIJ": 923, - "ĠÉijËIJ": 924, - "ĠaÊĬf": 925, - "am": 926, - "ÊĬÅĭ": 927, - "ĠÉĶËIJ": 928, - "ĠÉķËĪiÉľÅĭ": 929, - "ĠËĪÉĶËIJl": 930, - "ɪm": 931, - "jËĪo": 932, - "ËĪiËIJÉŁ": 933, - "ĠkwËĮÉĽ": 934, - "ĠmËĪas": 935, - "ÉĻh": 936, - "ĠËĪaÊĬ": 937, - "ËĪÉĶɪ": 938, - "É¡ÉĻɾ": 939, - "rÉĻn": 940, - "ËĪɪk": 941, - "sse": 942, - "ĠpËĪÉij": 943, - "ĠÉĹËĮe": 944, - "ĠÉĹËĪi": 945, - "Ġaz": 946, - "ĠÉ¡ËĪÊĮjaËIJ": 947, - "ze": 948, - "ĠÉĹËĮaËIJ": 949, - "ĠfËĪi": 950, - "ĠËĮÉĴn": 951, - "ĠxËĪo": 952, - "ĠËĮÊĬna": 953, - "ĠtʰaËIJ": 954, - "ĠsÉij": 955, - "ËĪeɪÊĥÉĻn": 956, - "ĠtÉķËĪiÉľ": 957, - "ĠÉŁaËIJ": 958, - "pËIJ": 959, - "Ġply": 960, - "θËĪi": 
961, - "ËIJÉĸ": 962, - "ĠtËĪuei": 963, - "ĠlËĪÉĻ": 964, - "ĠdÉijËIJ": 965, - "ft": 966, - "ËĪam": 967, - "ĠsËĪÊĮkt": 968, - "ĠtËĪou": 969, - "ĠpËĪiÉĽ": 970, - "ĠËĪai": 971, - "ĠwËĪÉĴn": 972, - "ĠzËĮaɪn": 973, - "Ġest": 974, - "ĠmÉĶ": 975, - "ĠtÉķjËĪÉiju": 976, - "Éľp": 977, - "ËĪÊĮz": 978, - "bi": 979, - "ËĪÉĽËIJseËIJ": 980, - "ĠlËĪy": 981, - "ĠmËĮe": 982, - "ĠdËĮÉĽl": 983, - "ËĪiËIJl": 984, - "ĠkËĮomo": 985, - "ĠhËĪaÉľn": 986, - "ËĪoËIJne": 987, - "ĠkËĪÊĮɾt": 988, - "ĠsyÊģ": 989, - "ËĮÉĶɾ": 990, - "Ġɪf": 991, - "uv": 992, - "zÉĻn": 993, - "ol": 994, - "Ïĩ": 995, - "im": 996, - "ĠmËĪiÉĽ": 997, - "Ġðɪ": 998, - "ĠvËĪÉĽ": 999, - "ÊĬd": 1000, - "Ġtr": 1001, - "ËĪeËIJs": 1002, - "ðe": 1003, - "de": 1004, - "ʰÏĩ": 1005, - "ÉŁÊ°": 1006, - "ËĮÉĻËIJÉªÉľ": 1007, - "bËIJ": 1008, - "ËĪÊĬk": 1009, - "ĠnËĪÉĶÉªÉľ": 1010, - "ĠËĮiËIJ": 1011, - "ËĪÉijËIJt": 1012, - "ËĪiËIJɾ": 1013, - "Ġtɹ": 1014, - "ɾÉĶ": 1015, - "ĠwÉĴz": 1016, - "Ġvu": 1017, - "bÉĻl": 1018, - "bÉĻ": 1019, - "ɹi": 1020, - "nts": 1021, - "ĠsËĪaËIJ": 1022, - "dʰ": 1023, - "ĠtÊĬ": 1024, - "ĠÊİËĮi": 1025, - "βa": 1026, - "hËĪÉĻÉľÅĭ": 1027, - "ĠsËĪiËIJ": 1028, - "ĠpËĮaɾa": 1029, - "ËĪÉĽÉ¾ÉĶ": 1030, - "ËĪɪs": 1031, - "É£o": 1032, - "ĠËĮal": 1033, - "or": 1034, - "ĠbËĪÊĮh": 1035, - "ĠkËĪoËIJ": 1036, - "ĠtËĪÉĽ": 1037, - "ĠpËĪo": 1038, - "ĠÊĴÉĻ": 1039, - "pÊģ": 1040, - "ĠËĪaɪ": 1041, - "hËĪÉijÉľÅĭ": 1042, - "ÉĻli": 1043, - "ËĪeɪt": 1044, - "ĠjËĪiouÉľ": 1045, - "ĠdËĪÉĻ": 1046, - "ĠmËĪÉĶËIJ": 1047, - "lËĪi": 1048, - "ËĮyÉĻ": 1049, - "ĠlËĪoËIJÉ¡": 1050, - "ĠnËĪÊĮ": 1051, - "ĠhËĪÊĬ": 1052, - "ĠnËĪÉĻÉľÅĭ": 1053, - "ĠÊģÉĻ": 1054, - "zËĪi": 1055, - "ĠtËĪuËIJ": 1056, - "ĠkËĮome": 1057, - "ĠlËĪeËIJ": 1058, - "ËĪaËIJtaËIJ": 1059, - "Ġan": 1060, - "ĠËĪyu": 1061, - "ĠËĮÊĮÉ¡ÉĻɾ": 1062, - "ĠËĪɪn": 1063, - "ĠhËĪoÉĻ": 1064, - "vÉĻ": 1065, - "ËĪøËIJ": 1066, - "θja": 1067, - "ËĪuÉĻÉľn": 1068, - "ĠkÉĻɾ": 1069, - "ËĪat": 1070, - "jËĪø": 1071, - "ËĪÉĽtÊģ": 1072, - "ĠpËĪÉiju": 1073, - "stÉĻ": 1074, - "ĠwÉĴt": 1075, - "ËĪeËIJl": 1076, - "ÊĪi": 
1077, - "ĠxËĪaiÉľ": 1078, - "ËĪyÊģ": 1079, - "ĠhËĪoËIJÉ¡aËIJ": 1080, - "ĠtsËĪi": 1081, - "ĠËĪÊĮp": 1082, - "ĠnËĮÉĴt": 1083, - "ĠlËĪɪeËIJ": 1084, - "ĠhËĪa": 1085, - "Ġfl": 1086, - "ĠnËĪeËIJ": 1087, - "ËĮaËIJɪ": 1088, - "ĠtËĪuo": 1089, - "tÊĥËIJ": 1090, - "sËĪe": 1091, - "bʰi": 1092, - "ĠbËĪÊĮhÊĬt": 1093, - "ËĪÉĽnd": 1094, - "ĠsËĪÉĶ": 1095, - "ÉĻns": 1096, - "ËĮÉĻl": 1097, - "ÉĽÉľ": 1098, - "ĠÉ¡l": 1099, - "ËĪɪɾ": 1100, - "ËĪaËIJta": 1101, - "ÉľËIJ": 1102, - "ËĪÉĽnto": 1103, - "skËĮoËIJ": 1104, - "ËĪÉĽk": 1105, - "tsi": 1106, - "ĠtËĪonÉ¡": 1107, - "ĠbiËIJ": 1108, - "ĠhËĪaËIJɪ": 1109, - "ĠbËĪi": 1110, - "jj": 1111, - "Êİi": 1112, - "Ġkʰ": 1113, - "ĠsËĪo": 1114, - "llo": 1115, - "Ġbaɪ": 1116, - "ĠÉĽnt": 1117, - "ĠËĪiËIJ": 1118, - "ĠÉ¡ËĪo": 1119, - "ɾeËIJ": 1120, - "ĠkÊĭ": 1121, - "ĠmËĪeiÉľ": 1122, - "ÊĬËĪÉĶËIJ": 1123, - "ĠtËĪaɪ": 1124, - "Ġsus": 1125, - "Ġri": 1126, - "ĠvËĮÉĽ": 1127, - "ËĪiËIJno": 1128, - "vano": 1129, - "ĠdËĮiËIJ": 1130, - "ĠÊIJËĪaÉľn": 1131, - "ÊĤ": 1132, - "ĠÉIJb": 1133, - "ËĪaËIJh": 1134, - "ɪÊĥ": 1135, - "ĠdËĮella": 1136, - "tËIJi": 1137, - "ĠËĪÊĬn": 1138, - "ĠhiËIJ": 1139, - "ĠbËĪaËIJt": 1140, - "ĠthËĪi": 1141, - "Ġam": 1142, - "ĠËĪoËIJ": 1143, - "Ġhu": 1144, - "ĠkËĪÊĮh": 1145, - "ĠzËĪÉijËIJ": 1146, - "ĠÉ¡ËĮÉĶ": 1147, - "ĠËĪÉĻÊĬ": 1148, - "yËĪi": 1149, - "ĠlËĪÊĮ": 1150, - "ĠdËĪeËIJ": 1151, - "ĠsËĪÉĶËIJ": 1152, - "skËĮeËIJ": 1153, - "ɾo": 1154, - "ÊģËĪÉij": 1155, - "tËĪa": 1156, - "ĠkËĪÊĬ": 1157, - "ËĪante": 1158, - "ĠdÉĶ": 1159, - "ĠsËĪeɪ": 1160, - "ĠsÉĽt": 1161, - "ɹɪ": 1162, - "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, - "zo": 1164, - "ĠjËĪaËIJ": 1165, - "ĠÉĴvðÉĻ": 1166, - "ĠÊĿ": 1167, - "ĠÉĽl": 1168, - "ĠsËĪoËIJ": 1169, - "ĠthËĪiÉľ": 1170, - "ĠËĪÉĽl": 1171, - "ĠlyËĮi": 1172, - "ndÊĴ": 1173, - "ĠÉķjËĪÉiju": 1174, - "θa": 1175, - "ĠɾËĮÉĻheËIJ": 1176, - "Ġmaɪ": 1177, - "jÉĻ": 1178, - "ĠËĪÊĮb": 1179, - "asjËĪÉĶ": 1180, - "dÊģ": 1181, - "ĠkhËĪa": 1182, - "ĠËĪes": 1183, - "vi": 1184, - "fi": 1185, - "ËĮÉĻb": 1186, - "Ġre": 1187, - "ĠavËĮÉĽ": 1188, - "ĠtËĮi": 1189, - 
"Ġkɾ": 1190, - "Ġbɪk": 1191, - "ste": 1192, - "ËĪeËIJÊĥc": 1193, - "pt": 1194, - "zÉĻ": 1195, - "ĠwËĪaËIJ": 1196, - "kl": 1197, - "ĠsËĪÊĮm": 1198, - "ɪÊĪ": 1199, - "dz": 1200, - "vo": 1201, - "ËĮaÊĬt": 1202, - "nde": 1203, - "ĠdÉĽs": 1204, - "ĠÉŁËĪaËIJ": 1205, - "ĠrËĮi": 1206, - "sËĮeËIJ": 1207, - "É¡i": 1208, - "Ġals": 1209, - "ËĪiðo": 1210, - "ĠnËĪiÉľn": 1211, - "ÊĬl": 1212, - "tsËIJ": 1213, - "ËĪanto": 1214, - "ĠÉĹËĪÉĻÊĬ": 1215, - "kËIJi": 1216, - "ĠsËĪÊĮb": 1217, - "ĠnËĪa": 1218, - "ĠlËĮo": 1219, - "ĠphËĪi": 1220, - "mËĮe": 1221, - "Ġfa": 1222, - "kÉĻ": 1223, - "ĠzËĪu": 1224, - "ns": 1225, - "ĠÊģe": 1226, - "ĠbËĪo": 1227, - "ËĪaËIJti": 1228, - "Ġman": 1229, - "ĠlËĪiÉij": 1230, - "ĠÉĹËĮyÉĻ": 1231, - "ĠfËĪÉĶËIJ": 1232, - "ĠkÊĭËĪeËIJÊĥc": 1233, - "ĠxËĪÉij": 1234, - "ĠtÉķËĪu": 1235, - "jÉĻɾ": 1236, - "Ġɪst": 1237, - "wËĪi": 1238, - "ĠËĮaɪnÉĻ": 1239, - "ɪɡ": 1240, - "ĠsÊĪ": 1241, - "ËĪiÉĻl": 1242, - "ĠnËĪiÉĽÉľn": 1243, - "ĠËĮÉĽËIJ": 1244, - "ËĪaɪnd": 1245, - "ĠzËĪi": 1246, - "vÉĻn": 1247, - "mz": 1248, - "ðos": 1249, - "dÊĴËIJ": 1250, - "jËĪa": 1251, - "ɾËĪÉĶ": 1252, - "lËĪe": 1253, - "ʲ": 1254, - "ĠvËĪÉĶ": 1255, - "ĠlËĪiÉĽ": 1256, - "θe": 1257, - "mËĪente": 1258, - "ĠɪnðÉĻ": 1259, - "Ġaɪm": 1260, - "nÉĻn": 1261, - "ĠhÉĻm": 1262, - "ɾaËIJ": 1263, - "ĠsËĪuoÉľ": 1264, - "ĠɲËĪi": 1265, - "ĠɹËĪiÉĻl": 1266, - "lËĪa": 1267, - "ĠbËĪÉĶ": 1268, - "ĠkËĪai": 1269, - "ÊģËĪa": 1270, - "ĠwËĪÉľËIJ": 1271, - "ĠaËIJ": 1272, - "Ġpas": 1273, - "ËĪÊĮs": 1274, - "wËĪÉĽÉ¾": 1275, - "ĠÉĹËĪe": 1276, - "ĠhËĮatÉĻ": 1277, - "aɪn": 1278, - "ĠËĪÉĶpʰ": 1279, - "ÊģËĪe": 1280, - "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, - "ĠËĪÊĬs": 1282, - "ĠtÉķhËĪiÉľ": 1283, - "ntÊĥ": 1284, - "ĠxËĪuo": 1285, - "ËĪuÊģ": 1286, - "Ġɪm": 1287, - "ɳÉĸ": 1288, - "ËĪyÉĻÉľkh": 1289, - "ĠËĪyÉĽ": 1290, - "ĠmËĮaËIJ": 1291, - "ÅĵÊģ": 1292, - "ĠËĪalt": 1293, - "ĠkÉĻm": 1294, - "Êİo": 1295, - "ĠÉIJn": 1296, - "Ġfy": 1297, - "ĠËĮÉĽra": 1298, - "ĠÉ¡ËĪÊĬ": 1299, - "ĠpËĪÊĮ": 1300, - "ls": 1301, - "ĠlËĪiËIJ": 1302, - "ĠÊĤËĪy": 1303, - 
"ĠbɪkËĪÊĮz": 1304, - "ĠÉ¡ÉĽt": 1305, - "Ġbɾ": 1306, - "tʰ": 1307, - "tÉĻlËĮÉĻb": 1308, - "xo": 1309, - "skËĮaËIJ": 1310, - "ɲʲ": 1311, - "ËĪeËIJkÊĪ": 1312, - "rÉĻ": 1313, - "tÊĥo": 1314, - "ĠpÊģÉĶ": 1315, - "ĠɹËĪaɪt": 1316, - "ĠpËĪei": 1317, - "ËĮɪç": 1318, - "jËĪÉĽÉ¾": 1319, - "tËIJa": 1320, - "ĠÉIJbËĮaÊĬt": 1321, - "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, - "ĠvËĪe": 1323, - "ÊĬÉľ": 1324, - "ĠakËĪe": 1325, - "ĠpËĪai": 1326, - "vËĪÉĽ": 1327, - "Ġθɹ": 1328, - "ɪf": 1329, - "ĠavËĪÉĽ": 1330, - "ĠkËĪe": 1331, - "dËĪi": 1332, - "ËĪeËIJÉĸ": 1333, - "ĠbÉĻt": 1334, - "ÊĪʰ": 1335, - "teËIJ": 1336, - "θjËĪÉĶn": 1337, - "dÉľ": 1338, - "ĠjËĪiÉľ": 1339, - "Ġve": 1340, - "É£ËĪu": 1341, - "ËĪÊĮhÉĻl": 1342, - "ĠpÉĶ": 1343, - "ĠÉ¡r": 1344, - "Ġða": 1345, - "ĠvËĪiËIJ": 1346, - "ĠËĮÉijËIJ": 1347, - "ËĪÉĻÊĬnt": 1348, - "ĠbËĪaËIJɾ": 1349, - "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, - "ld": 1351, - "ĠtÉķËĮÉĶ": 1352, - "pa": 1353, - "ðËĪad": 1354, - "ËĪiɾ": 1355, - "ĠxËĪu": 1356, - "ĠlËĪiÉľÅĭ": 1357, - "ËĪeɪs": 1358, - "ĠÉĹËĮeÉľn": 1359, - "ĠthËĪiÉĽ": 1360, - "tËIJe": 1361, - "ĠavËĮÉĽk": 1362, - "ĠËĮÉĶ": 1363, - "ĠkËĪÉiju": 1364, - "ɪv": 1365, - "iËIJz": 1366, - "ËĪos": 1367, - "Ġɡɹ": 1368, - "and": 1369, - "ĠlËĪiou": 1370, - "ĠËĪoÉľ": 1371, - "É¡l": 1372, - "ĠpËĪÉĶËIJ": 1373, - "ĠmËĮeËIJ": 1374, - "ĠkËĪÉĴ": 1375, - "nos": 1376, - "çÉĻn": 1377, - "fÉĻn": 1378, - "ĠsËĪÊĮktËĮeËIJ": 1379, - "ĠËĪaɪn": 1380, - "ËĪoËIJre": 1381, - "jËĪÉĽn": 1382, - "ĠðËĪÉĽn": 1383, - "ĠtÉķhËĪiÉĽÉľn": 1384, - "ĠhËĪaɪ": 1385, - "ɾËĪÉĽ": 1386, - "ĠsËĪu": 1387, - "ĠkËĪɪjaËIJ": 1388, - "ĠpjËĮÊĬ": 1389, - "ĠhÉĻmËĮaËIJ": 1390, - "ĠËĮÊĮp": 1391, - "ĠpËĪÊĮhÉĻl": 1392, - "ĠxËĪÉĻ": 1393, - "dËĪe": 1394, - "ĠmÉij": 1395, - "ĠÊĬm": 1396, - "ndÉĻ": 1397, - "ĠdËĪÉĻÊĬnt": 1398, - "ËĪeËIJÊĥÉĻn": 1399, - "Ġðats": 1400, - "is": 1401, - "ĠcËĪaËIJh": 1402, - "pe": 1403, - "ĠsËĮo": 1404, - "ĠðËĪe": 1405, - "ĠsËĪaËIJt": 1406, - "ËĪaÊģ": 1407, - "ĠsËĪe": 1408, - "ÉĻk": 1409, - "ɪÊĭ": 1410, - "ĠkËĪoËIJi": 1411, - "kÉĶ": 1412, - "ĠvËĪaËIJÊĬ": 1413, - "ĠfËĪei": 1414, 
- "ĠlËĪeËIJk": 1415, - "ĠhËĪiÉĻ": 1416, - "ĠaÊĬ": 1417, - "ËĪÉĽndo": 1418, - "ËĪes": 1419, - "ĠzËĪÉĶ": 1420, - "ĠËĪÉĽÉ¾a": 1421, - "nËĪiÉľn": 1422, - "ĠkËĪÊĮm": 1423, - "ĠlËĪÉĴ": 1424, - "ɪst": 1425, - "ĠpÉij": 1426, - "ĠfËĪÉĶ": 1427, - "ĠthËĪonÉ¡": 1428, - "nke": 1429, - "ËĮɪk": 1430, - "ĠɲËĪÉĻ": 1431, - "ËĮÊĮm": 1432, - "ËĪiËIJt": 1433, - "ĠwËĪÉĴnt": 1434, - "ËĪaβan": 1435, - "ĠbËĪÊĮr": 1436, - "ÉĽnd": 1437, - "ĠËĮÉijËIJbÉľ": 1438, - "ĠvËĪaɪ": 1439, - "ĠtÊĥËĮi": 1440, - "ĠθËĪɪÅĭk": 1441, - "sti": 1442, - "Ġkɹ": 1443, - "ĠËĪaÊĬt": 1444, - "stÉĻn": 1445, - "ĠÊĭËĪÊĮn": 1446, - "ĠÉ¡ËĮaËIJ": 1447, - "ËĪaËIJÉľÉ²": 1448, - "Êģi": 1449, - "ĠnËĪÉĶx": 1450, - "ĠɹËĪiÉĻlɪ": 1451, - "ĠvËĮi": 1452, - "ĠðeÉĻ": 1453, - "ËĮɪtÊĥ": 1454, - "ĠvËĪyÉĻ": 1455, - "ĠËĮaËIJpkËĮaËIJ": 1456, - "ĠfËĮaËIJɪ": 1457, - "ĠpËĪÉĶ": 1458, - "ĠnËĪÊĮmb": 1459, - "θes": 1460, - "jËĪÉĽÊģ": 1461, - "ĠkËĪÊĬcʰ": 1462, - "mËĪÉĽ": 1463, - "ĠvËĪu": 1464, - "ĠlÅĵÊģ": 1465, - "ĠiËIJm": 1466, - "ÊĪÉĻɾ": 1467, - "tÊĥi": 1468, - "ËIJs": 1469, - "ĠtËĪy": 1470, - "ĠmËĪiÉľÅĭ": 1471, - "ɾËĪe": 1472, - "mËĮa": 1473, - "ĠmËĮiËIJ": 1474, - "ĠÉĽks": 1475, - "ɪp": 1476, - "ĠkËĪÊĮɾnËĮaËIJ": 1477, - "ĠËĮaÊĬx": 1478, - "rËĪiËIJ": 1479, - "ĠcËĪÊĮl": 1480, - "mos": 1481, - "ĠkËĪÊĮɾtËĮeËIJ": 1482, - "iËIJɾ": 1483, - "kÉĻn": 1484, - "ĠdËĪu": 1485, - "naËIJ": 1486, - "ĠpwËĪe": 1487, - "ËĮÉĶɪ": 1488, - "ĠtÉķhËĪiÉĽ": 1489, - "ĠβËĪi": 1490, - "ËĪiÉĽÉľt": 1491, - "Ġte": 1492, - "ËĪaðos": 1493, - "mËĪa": 1494, - "ĠvËĪo": 1495, - "ĠmËĪɪ": 1496, - "ĠbËĮi": 1497, - "ad": 1498, - "do": 1499, - "ĠnËĪaÊĬ": 1500, - "ĠʲËĪyÉľ": 1501, - "wËĪÉĽ": 1502, - "ËĪis": 1503, - "el": 1504, - "Ġpar": 1505, - "ĠtËĪai": 1506, - "ĠdËĪɪjaËIJ": 1507, - "hËĪi": 1508, - "ĠɾËĪÊĮ": 1509, - "ĠdËĪe": 1510, - "ËĪaɪd": 1511, - "Ġper": 1512, - "ĠsËĮÉĶ": 1513, - "we": 1514, - "ÊĬm": 1515, - "Ġin": 1516, - "ĠjËĪuËIJz": 1517, - "ËĪiËIJpÉĻl": 1518, - "ĠÊĭËĪaËIJl": 1519, - "ĠetËĪÉĽ": 1520, - "ËĮÉĽm": 1521, - "ĠnËĪu": 1522, - "ËĪÉĽkt": 1523, - "ĠiËIJɾ": 1524, - "Ġbɹ": 1525, 
- "ĠtshËĪi": 1526, - "ĠÉĹËĪÉĶÉľ": 1527, - "ĠkwËĮa": 1528, - "ĠfËĪuÉľ": 1529, - "wËĮa": 1530, - "ĠdËĪiËIJ": 1531, - "ĠÉ¡ËĪyÉĻ": 1532, - "ËĮÉĽËIJ": 1533, - "rËĪa": 1534, - "Ġne": 1535, - "ĠzËĪyÉĻ": 1536, - "ĠbËĪaɪ": 1537, - "ĠÉŁËĪÊĮb": 1538, - "ËĪuËIJto": 1539, - "ÊĬnt": 1540, - "Ġcʰ": 1541, - "ËĪÉĽnti": 1542, - "ËĪoÉĻ": 1543, - "ĠsËĮÊĮm": 1544, - "ĠlÉij": 1545, - "ËĮeva": 1546, - "É¾ÉĽ": 1547, - "ntÉľ": 1548, - "ĠmËĪÉĽn": 1549, - "ËĪÉijËIJk": 1550, - "Ġkil": 1551, - "ËĪones": 1552, - "ff": 1553, - "ĠmËĪÉĽËIJ": 1554, - "ĠvËĪÉĻɪ": 1555, - "ĠËĪÉĶËIJ": 1556, - "ĠËĮɪnt": 1557, - "ÊĬn": 1558, - "Ġwɪl": 1559, - "Ġsin": 1560, - "ĠËĮalla": 1561, - "ĠaβËĪia": 1562, - "pi": 1563, - "ËĪoÉľ": 1564, - "ɪjËĮaËIJ": 1565, - "ku": 1566, - "ĠvËĪɪ": 1567, - "Ġtut": 1568, - "ĠtËĪeÉľ": 1569, - "ĠhËĪÉĶ": 1570, - "βɾe": 1571, - "sÉĻɾ": 1572, - "ĠkhËĪai": 1573, - "ĠmËĪÉĶ": 1574, - "Ġta": 1575, - "ĠɲËĪaËIJ": 1576, - "Ġnu": 1577, - "ËĪuËIJn": 1578, - "ĠÉĻËIJÉľ": 1579, - "ĠËĪaÊĬf": 1580, - "ËĪiËIJdÉľ": 1581, - "nti": 1582, - "ĠpËĪiËIJpÉĻl": 1583, - "Ġkj": 1584, - "Ġpe": 1585, - "ĠmËĪÉij": 1586, - "ËĮaɪ": 1587, - "ËĪaËIJle": 1588, - "ĠvËĮÉĻËIJÉªÉľ": 1589, - "mpo": 1590, - "ĠkËĪɪt": 1591, - "ĠnËĮÉĽ": 1592, - "ĠÉŁËĪaËIJtaËIJ": 1593, - "ĠsËĪaËIJtʰ": 1594, - "ĠÉŁËĪi": 1595, - "Ġso": 1596, - "ĠbËĪÉĽ": 1597, - "kËĪi": 1598, - "ɪti": 1599, - "Ġtsi": 1600, - "ĠkÊģ": 1601, - "ËĮÉĴ": 1602, - "É¡ÉĻl": 1603, - "kst": 1604, - "ĠmËĪÉĻËIJ": 1605, - "ËĪÊĮk": 1606, - "ĠnËĪaËIJÊĬ": 1607, - "Ġap": 1608, - "ĠlËĪɪkʰ": 1609, - "lli": 1610, - "ĠkwËĪal": 1611, - "ĠËĪÉĻËIJ": 1612, - "ĠtsËĪuei": 1613, - "Ġdo": 1614, - "ĠkËIJjËĪo": 1615, - "ÊĬz": 1616, - "ĠpËĪaËIJ": 1617, - "ĠmËĪuËIJ": 1618, - "ĠÉ¡ÉĻv": 1619, - "rËĪi": 1620, - "Ġtw": 1621, - "ËĮɪn": 1622, - "dËĪÉij": 1623, - "ĠðËĪi": 1624, - "ĠËĪaËIJi": 1625, - "ĠhËĪiÉĽ": 1626, - "ĠðËĮÉĽm": 1627, - "ĠpʰËĪɪɾ": 1628, - "ÉĴm": 1629, - "ĠËĮeËIJ": 1630, - "ĠthËĪaiÉľ": 1631, - "ĠvËĪas": 1632, - "ĠnÉijËIJ": 1633, - "pÉĻn": 1634, - "ĠpËĮÉĻɾ": 1635, - "ĠÉĹËĪaËIJɪ": 1636, - 
"ËĪouÉľ": 1637, - "ĠÊIJËĪuÉľ": 1638, - "ĠmËĪan": 1639, - "ĠtËĪÉĻÉªÉľ": 1640, - "ĠlËĪaËIJÊĬ": 1641, - "mËĪÉĽnte": 1642, - "ĠfËĪam": 1643, - "sjËĪÉĶ": 1644, - "ĠpËĪÉĻ": 1645, - "ËĪeËIJm": 1646, - "ĠpËĪÊĮr": 1647, - "jËĪi": 1648, - "ĠlÉĽ": 1649, - "Ġten": 1650, - "ËĪoËIJra": 1651, - "ki": 1652, - "ĠÊĤËĪaËIJÊĬ": 1653, - "kɪ": 1654, - "bËIJe": 1655, - "ËĪalt": 1656, - "ðɪ": 1657, - "pËĪi": 1658, - "ĠËĮÉĽnt": 1659, - "ĠmËĪei": 1660, - "ĠhËĪÉĻÊĬ": 1661, - "ĠhËĪÉĽÉ¾": 1662, - "jËĪÉij": 1663, - "ĠhËĪÊĬaËIJ": 1664, - "mÉľ": 1665, - "Ġdʰ": 1666, - "ĠtÊĥËĪe": 1667, - "lËĪÉĽ": 1668, - "ËĪaËIJte": 1669, - "ĠpËĪuËIJ": 1670, - "ĠmËĪÊĬ": 1671, - "ËĪaËIJɪÊĪ": 1672, - "diËIJ": 1673, - "ĠfɹÉĴm": 1674, - "ĠhËĪÉijËIJ": 1675, - "βo": 1676, - "ĠmËĪiÉľn": 1677, - "ĠðiËIJz": 1678, - "ĠkËĪou": 1679, - "ËĪiËIJna": 1680, - "ĠavËĮeva": 1681, - "ĠËĪaËIJɾ": 1682, - "ĠnËĪuËIJɾ": 1683, - "ĠβËĪe": 1684, - "Ġzaɪn": 1685, - "ËĪÉĽd": 1686, - "ÉĹ": 1687, - "ËĪeɪk": 1688, - "sËĮÉĻÊĬ": 1689, - "ËĪeËIJÉŁ": 1690, - "ĠÊĤËĪÉĻËIJ": 1691, - "je": 1692, - "cʰËIJ": 1693, - "ËĪÉĶr": 1694, - "ÉĽËIJ": 1695, - "ĠtÉķhËĪyÃ¦Éľn": 1696, - "ĠËĮaɪnÉĻn": 1697, - "ĠiËIJn": 1698, - "ĠbËĪÊĮc": 1699, - "ËĪiËIJm": 1700, - "ɾas": 1701, - "ËĮÉĻs": 1702, - "ĠvËĪeËIJ": 1703, - "ĠËĪÉĻrÉľ": 1704, - "ĠduËIJ": 1705, - "ntÉĻ": 1706, - "ĠpɹËĪÉĴ": 1707, - "ĠbËĪɪ": 1708, - "ĠwËĪoÉľ": 1709, - "nËĮi": 1710, - "ĠhÉIJ": 1711, - "ĠkËĪÉĽ": 1712, - "Ġet": 1713, - "jËĪÉĽndo": 1714, - "ĠËĪaiÉľ": 1715, - "Ġli": 1716, - "ĠËĪaÊĬs": 1717, - "kËIJo": 1718, - "ĠÉĹËĪyÉĻ": 1719, - "keËIJ": 1720, - "ĠfËĪiËIJl": 1721, - "ĠbʰËĪaËIJi": 1722, - "ĠÉ¡ÉĻÊĥ": 1723, - "ÊĴËĪe": 1724, - "ĠnjËĪuËIJ": 1725, - "ĠËĪak": 1726, - "ĠÉĹËĪaËIJ": 1727, - "zËĪa": 1728, - "vËĪe": 1729, - "ĠhËĮaÊĬ": 1730, - "ÉIJç": 1731, - "ĠɾËĪÊĮkʰ": 1732, - "pËĪe": 1733, - "ĠtÉĻbi": 1734, - "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, - "ĠfËĪÉĽ": 1736, - "ĠwËĮɪtÊĥ": 1737, - "ĠtÉķËĪyÉĽÉľ": 1738, - "wËĮe": 1739, - "ËĮaɪt": 1740, - "ĠnÉijËIJx": 1741, - "ĠkËĪÉĶËIJn": 1742, - "ÊĬk": 1743, - "ĠbËĪaËIJd": 1744, - "ÅĭÉĻn": 
1745, - "Ġni": 1746, - "ĠbËĪe": 1747, - "ĠmËĮÊĬ": 1748, - "ËĪar": 1749, - "ĠmËĮeɪk": 1750, - "ĠsËĪaËIJɾ": 1751, - "βe": 1752, - "ĠtÉķhËĪiÉľÅĭ": 1753, - "itËĪe": 1754, - "kËĮe": 1755, - "ËĪÉĽËIJl": 1756, - "ËĮÉĴn": 1757, - "ËĮÉij": 1758, - "ĠbËĪɪl": 1759, - "ĠwÊĬd": 1760, - "ĠbËĪoËIJl": 1761, - "rd": 1762, - "iÉĻ": 1763, - "Ġda": 1764, - "ĠbËĪaËIJÊĬ": 1765, - "ĠnËĪÊĮmbÉĻɾ": 1766, - "ËĪaËIJÉªÉľ": 1767, - "ĠÉĽm": 1768, - "ĠmiËIJɾ": 1769, - "ËĪeɪm": 1770, - "los": 1771, - "ËĮÉĽt": 1772, - "ĠËĮaÊĬs": 1773, - "ĠmËĪaÉľt": 1774, - "ĠwËĪuÉĻ": 1775, - "ĠwËĪeɪ": 1776, - "Ġseɲ": 1777, - "ĠbjËĪÉĽ": 1778, - "ĠwÉĽn": 1779, - "fl": 1780, - "ĠkhwËĪa": 1781, - "dËĪÉĽ": 1782, - "vɹɪ": 1783, - "ĠËĪaɾ": 1784, - "jËĪÉijuÉľ": 1785, - "ĠËĮaËIJpkËĮeËIJ": 1786, - "bÊģ": 1787, - "ĠtËĪaɪm": 1788, - "ĠËĪÉij": 1789, - "ĠsËĮa": 1790, - "ĠzËĪoɪ": 1791, - "ËĪÉĶɾa": 1792, - "ĠdËĪø": 1793, - "ËĪÉĶɾt": 1794, - "ĠÅĭËĪÉĶ": 1795, - "min": 1796, - "ĠlËĪÊĬk": 1797, - "ËĪÉĶËIJt": 1798, - "ĠËĪÉĶtɾ": 1799, - "ĠfËĪaɪ": 1800, - "ĠÉ¡ÉĴt": 1801, - "ËĪeËIJÉĻn": 1802, - "kËĪÉĶ": 1803, - "ĠvËĪÉĽÉ¹i": 1804, - "mÉĽ": 1805, - "ËĪaɪz": 1806, - "Ġesp": 1807, - "ɲa": 1808, - "ĠlËĪo": 1809, - "ËĪÉĽËIJra": 1810, - "βËĪi": 1811, - "ouÉľ": 1812, - "ËĮÉĻk": 1813, - "tÊĥuËIJ": 1814, - "ĠnËĪyÉĻ": 1815, - "ÊĪɾ": 1816, - "ĠÉ¡ËĪy": 1817, - "ĠtËĪoðo": 1818, - "ËĪɪçt": 1819, - "Ġmɪç": 1820, - "ĠËĪand": 1821, - "ĠkwËĮÉĽl": 1822, - "ĠÊĤËĪaËIJ": 1823, - "ĠnËĪiÉľ": 1824, - "ËĪÉĶp": 1825, - "ËĪiËIJz": 1826, - "ĠÊĤËĪaÊĬ": 1827, - "ĠɾËĮÉĻhi": 1828, - "ĠsËĮÊĬo": 1829, - "ĠÉĽÉ¡": 1830, - "ĠdÅĵ": 1831, - "ĠÉ¡ËĮaËIJÉªÉľ": 1832, - "dɪ": 1833, - "lËĮa": 1834, - "stËĪi": 1835, - "ĠdËĮiËIJz": 1836, - "ĠtËĮÊĬ": 1837, - "θi": 1838, - "ĠËĪɪskËĮoËIJ": 1839, - "ndÉĻn": 1840, - "Ġtsv": 1841, - "ĠhËĪÉĻËIJ": 1842, - "ĠÊĥËĪÊĬ": 1843, - "ÉĻtËĮeËIJ": 1844, - "pËĮÉĽ": 1845, - "ËĪaɾÉĶn": 1846, - "ĠpÉĽÊģ": 1847, - "Ġy": 1848, - "mnËĮeËIJ": 1849, - "ËĪÉĽllo": 1850, - "ĠÉ¡ËĪÉĻ": 1851, - "ĠËĮad": 1852, - "ĠÊĥv": 1853, - "ËĪÊıɾ": 1854, - "rËĪe": 1855, - "yËIJ": 
1856, - "ĠpËĪaËIJs": 1857, - "ĠËĪÉĽn": 1858, - "ɪdÊĴ": 1859, - "ËĪuai": 1860, - "Ġfi": 1861, - "ĠtËĪyÉĻ": 1862, - "ËĪaËIJÉŁ": 1863, - "ĠtjËĪe": 1864, - "ËĪaËIJnaËIJ": 1865, - "stɾ": 1866, - "Êİe": 1867, - "ËĮeɪt": 1868, - "ba": 1869, - "ðas": 1870, - "vÊģ": 1871, - "ĠzËĪÉĻËIJ": 1872, - "ËĪaËIJli": 1873, - "ÉŁÊ°eËIJ": 1874, - "ËĪaËIJteËIJ": 1875, - "ĠvËĪa": 1876, - "Ġsal": 1877, - "ËĪaËIJno": 1878, - "ĠÉ¡ÉĻz": 1879, - "ĠhËĪoËIJti": 1880, - "ĠɲËĪiÉĽ": 1881, - "tÉľ": 1882, - "ĠËĪaËIJp": 1883, - "ĠwËĪÉĽl": 1884, - "ĠmËĪɪl": 1885, - "ĠfyËIJɾ": 1886, - "ËĪÉĽËIJsaËIJ": 1887, - "ĠbËĮiËIJ": 1888, - "ËĪaËIJjaËIJ": 1889, - "ËĪɪp": 1890, - "ĠfÊģ": 1891, - "tsiËĪoËIJne": 1892, - "ĠwËĪuÉľ": 1893, - "Ġvi": 1894, - "ĠwËĪÉijÉľn": 1895, - "ËĪoËIJn": 1896, - "ĠÉĹËĪÉĻɪ": 1897, - "ĠÊĿËĪo": 1898, - "Ġra": 1899, - "mÉĻnt": 1900, - "ËĪaÊĬnd": 1901, - "ĠpÉĽÉ¾": 1902, - "ĠÉĹËĪaËIJÊĬ": 1903, - "oËIJɾ": 1904, - "hËĪo": 1905, - "ĠÉĴn": 1906, - "ĠÊİe": 1907, - "ĠsËĪɪks": 1908, - "É¡n": 1909, - "ĠÉ¡ËĪa": 1910, - "Ġθj": 1911, - "ĠpËĪe": 1912, - "spe": 1913, - "ĠvËĪÉĻ": 1914, - "ĠfËĪɪ": 1915, - "ĠËĮɪntÊĬ": 1916, - "lÉĻn": 1917, - "ĠnËĪiËIJd": 1918, - "ĠsËĮÊĬa": 1919, - "ĠËĪum": 1920, - "ĠdËĪeɪ": 1921, - "ĠËĪÊĮbʰi": 1922, - "ËĪÉijËIJɾ": 1923, - "ĠbËĪiÉĽÉľt": 1924, - "Êİos": 1925, - "ĠtshËĪaiÉľ": 1926, - "ĠËĮɪskËĮaËIJ": 1927, - "ĠaÊĬÉĻ": 1928, - "ĠËĪyæ": 1929, - "Ġdyn": 1930, - "ĠmËĪiËIJn": 1931, - "ĠËĪÊĮcʰËIJ": 1932, - "ĠsÉĽ": 1933, - "ĠnËĪy": 1934, - "ĠnËĮÉĽl": 1935, - "ɡɾ": 1936, - "ÊĥËĪe": 1937, - "ĠÊĤËĮÉĽ": 1938, - "ĠËĪÉĽvɹɪ": 1939, - "ËĪÉĽlp": 1940, - "ĠbËĪak": 1941, - "ĠeËIJ": 1942, - "ĠfËĪaËIJ": 1943, - "ĠkÉĽl": 1944, - "ĠËĪeËIJs": 1945, - "jËĪaËIJd": 1946, - "ĠlËĮi": 1947, - "mbɾe": 1948, - "ktÉĻ": 1949, - "nta": 1950, - "tËĪu": 1951, - "ĠðËĪat": 1952, - "ĠËĪaβ": 1953, - "ÉĻɹi": 1954, - "ĠkwËĮÉĽlla": 1955, - "ĠbÉĻn": 1956, - "rËĮÉĽ": 1957, - "ĠnÉĶ": 1958, - "ĠÉ¡ËĪɪ": 1959, - "ĠËĪap": 1960, - "ɹÉĻ": 1961, - "ËĪaÉľkh": 1962, - "ĠÊIJËĪi": 1963, - "ĠËĪÉijËIJ": 1964, - "ɪɡÉĻn": 1965, - "ĠwËĪai": 
1966, - "ĠpÉĻt": 1967, - "kËIJa": 1968, - "ĠbËĪÉĽËIJ": 1969, - "ËĪeËIJÊĭ": 1970, - "lsÉĻÊĬ": 1971, - "ĠcËĪaËIJhɪËĮeËIJ": 1972, - "ĠkÉĻn": 1973, - "ĠËĮaɪnÉĻm": 1974, - "ËĪuËIJt": 1975, - "ĠhËĪaÊĬ": 1976, - "ĠtËĪanto": 1977, - "ĠhÉIJz": 1978, - "ĠsËĪÊĮɾ": 1979, - "Ġno": 1980, - "ĠtËĪÉĶËIJ": 1981, - "ĠzËĪaɪ": 1982, - "ĠtÉķËĪiÉĽÉľ": 1983, - "ĠkozËĪi": 1984, - "ĠkËĪei": 1985, - "ðËĪÉĶɾ": 1986, - "ËĮÉĶÊģ": 1987, - "ĠtËĪÊĮɾ": 1988, - "ĠÊIJËĪÉĻ": 1989, - "ĠÉķËĪyÉĽÉľ": 1990, - "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, - "mf": 1992, - "ĠvËĪiËIJdÉľ": 1993, - "kËĪa": 1994, - "ĠÉIJÉ¡": 1995, - "kw": 1996, - "ĠÊģÉĽ": 1997, - "xÉĻn": 1998, - "ĠdÊĬ": 1999, - "ĠkËĪÊĮɾnËĮeËIJ": 2000, - "jËĪaËIJdaËIJ": 2001, - "ĠfÉĻ": 2002, - "ĠËĮimp": 2003, - "Ġhɪz": 2004, - "ĠʰÏĩ": 2005, - "ËĪoËIJni": 2006, - "ĠxËĪiÉľ": 2007, - "ËĪeËIJsÊĪ": 2008, - "ÊıbÉľ": 2009, - "ËĮÉĶɾke": 2010, - "ĠÉ¡ËĪÉĻÊĬ": 2011, - "ËĪɪÊĥÉĻn": 2012, - "les": 2013, - "ĠfËĪiËIJ": 2014, - "É¡tÉĻ": 2015, - "ËĪeËIJre": 2016, - "ĠvËĮaËIJ": 2017, - "ĠËĪeɪ": 2018, - "ĠmËĪuÉĻÉľn": 2019, - "ĠÉ¡ËĪÊĬd": 2020, - "ĠmËĮaɪn": 2021, - "zËĪe": 2022, - "ĠlËĪiÉľ": 2023, - "Ġmu": 2024, - "ĠkËĮÉĽl": 2025, - "ĠjËĮÉĻh": 2026, - "ĠfËĮÉĶɾ": 2027, - "fɹ": 2028, - "ĠkËĪaɪn": 2029, - "ĠËĪÉĴlsÉĻÊĬ": 2030, - "θɪÅĭ": 2031, - "ĠthËĪonÉ¡Éľ": 2032, - "tËĪÉij": 2033, - "θjo": 2034, - "mËĪÉĶ": 2035, - "Ġos": 2036, - "ĠsÊĬ": 2037, - "ĠsËĪÊĮmÉĻ": 2038, - "ĠvËĮÉĽn": 2039, - "nËĪo": 2040, - "ĠËĪaktÊĥuËIJ": 2041, - "É£a": 2042, - "Ġtʰi": 2043, - "ĠfËĮi": 2044, - "ĠvËĪÉĽl": 2045, - "ĠtËĪutËIJi": 2046, - "xos": 2047 - }, - "merges": [ - [ - "Ë", - "Ī" - ], - [ - "Ë", - "IJ" - ], - [ - "ËĪ", - "É" - ], - [ - "Ë", - "Į" - ], - [ - "É", - "Ļ" - ], - [ - "ËĪ", - "a" - ], - [ - "ËĪ", - "i" - ], - [ - "Ġ", - "t" - ], - [ - "É", - "ª" - ], - [ - "É", - "¾" - ], - [ - "Ġ", - "É" - ], - [ - "Ġ", - "k" - ], - [ - "É", - "ľ" - ], - [ - "Ġ", - "s" - ], - [ - "ËĪ", - "e" - ], - [ - "É", - "Ľ" - ], - [ - "ËĪ", - "o" - ], - [ - "Ġ", - "l" - ], - [ - "ËĪÉ", - "Ľ" - ], - [ - "Ġ", - "d" - ], - [ - "Ê", - 
"Ĭ" - ], - [ - "ËĪa", - "ËIJ" - ], - [ - "Ġ", - "p" - ], - [ - "Ì", - "ĥ" - ], - [ - "Ġ", - "m" - ], - [ - "ËĪ", - "u" - ], - [ - "Å", - "ĭ" - ], - [ - "Ã", - "°" - ], - [ - "ËĪÉ", - "Ķ" - ], - [ - "Ê", - "Į" - ], - [ - "ËĮ", - "a" - ], - [ - "Ġ", - "h" - ], - [ - "ËĪ", - "ÊĮ" - ], - [ - "Ġ", - "n" - ], - [ - "Ê", - "ģ" - ], - [ - "ËĪÉ", - "ij" - ], - [ - "Ê", - "ĥ" - ], - [ - "e", - "ËIJ" - ], - [ - "Ġ", - "a" - ], - [ - "Ġ", - "b" - ], - [ - "É", - "Ķ" - ], - [ - "ËĪÉ", - "Ļ" - ], - [ - "ÉĻ", - "n" - ], - [ - "Ġ", - "f" - ], - [ - "ËĪÉ", - "ª" - ], - [ - "É", - "¡" - ], - [ - "ËĪe", - "ËIJ" - ], - [ - "Ġ", - "j" - ], - [ - "n", - "t" - ], - [ - "Ġ", - "ð" - ], - [ - "Ġ", - "ËĮ" - ], - [ - "Ġt", - "s" - ], - [ - "ĠÉ", - "¡" - ], - [ - "É", - "ķ" - ], - [ - "ËĪo", - "ËIJ" - ], - [ - "Ê", - "°" - ], - [ - "a", - "ËIJ" - ], - [ - "ËĪ", - "y" - ], - [ - "Ġt", - "Éķ" - ], - [ - "ËĪi", - "ËIJ" - ], - [ - "Ġ", - "Ê" - ], - [ - "Ġ", - "v" - ], - [ - "Ġ", - "w" - ], - [ - "s", - "t" - ], - [ - "É", - "ij" - ], - [ - "n", - "d" - ], - [ - "ËĮ", - "i" - ], - [ - "Ì", - "ª" - ], - [ - "ËĮ", - "e" - ], - [ - "Ġ", - "z" - ], - [ - "ËĪa", - "ɪ" - ], - [ - "ËĪi", - "ÉĽ" - ], - [ - "Î", - "²" - ], - [ - "É", - "¹" - ], - [ - "Ġ", - "ËĮa" - ], - [ - "Î", - "¸" - ], - [ - "Ġh", - "ÉĽ" - ], - [ - "Ê", - "Ī" - ], - [ - "i", - "ËIJ" - ], - [ - "ËĮ", - "o" - ], - [ - "Ġ", - "ɪ" - ], - [ - "Éľ", - "n" - ], - [ - "Ġ", - "x" - ], - [ - "Ġt", - "ÉĻ" - ], - [ - "ËĪu", - "ËIJ" - ], - [ - "ËĮ", - "ÉĻ" - ], - [ - "Ġj", - "ËĪi" - ], - [ - "ËĮ", - "ÉĽ" - ], - [ - "ĠÉ", - "Ľ" - ], - [ - "Ġ", - "ËĪa" - ], - [ - "ËĮa", - "ËIJ" - ], - [ - "Ġl", - "a" - ], - [ - "Ġð", - "e" - ], - [ - "ĠhÉĽ", - "ËIJ" - ], - [ - "Ġ", - "e" - ], - [ - "Ã", - "§" - ], - [ - "ÉĻ", - "l" - ], - [ - "o", - "ËIJ" - ], - [ - "ËĪÉij", - "u" - ], - [ - "Ê", - "Ĵ" - ], - [ - "u", - "ËIJ" - ], - [ - "ĠÉ", - "Ĺ" - ], - [ - "ĠÉ", - "ķ" - ], - [ - "ËĮ", - "eËIJ" - ], - [ - "ĠtÉķ", - "ËĪi" - ], - [ - "o", - "s" - ], - [ - "ËĪÉĶ", - 
"ËIJ" - ], - [ - "a", - "s" - ], - [ - "ËĪ", - "ÊĬ" - ], - [ - "Ġ", - "i" - ], - [ - "ËĪa", - "i" - ], - [ - "É", - "²" - ], - [ - "ɪ", - "n" - ], - [ - "t", - "s" - ], - [ - "Éľ", - "Åĭ" - ], - [ - "ĠÉ", - "Ł" - ], - [ - "Ġ", - "Êĥ" - ], - [ - "ËĪe", - "ɪ" - ], - [ - "ÉĽ", - "ɾ" - ], - [ - "ËĪÉĽ", - "ËIJ" - ], - [ - "ËĪÉĽ", - "ɾ" - ], - [ - "Ġ", - "r" - ], - [ - "t", - "Êĥ" - ], - [ - "ËĮ", - "ÉĶ" - ], - [ - "Ġd", - "ÉĻ" - ], - [ - "t", - "ÉĻ" - ], - [ - "o", - "u" - ], - [ - "ËĪy", - "ÉĻ" - ], - [ - "ĠËĮ", - "i" - ], - [ - "ÉĻ", - "ɾ" - ], - [ - "ËĪÉĻ", - "ÊĬ" - ], - [ - "ËĪÊĮ", - "ɾ" - ], - [ - "ËĪÉ", - "Ĵ" - ], - [ - "Ġt", - "h" - ], - [ - "ËĪo", - "n" - ], - [ - "Ê", - "ĭ" - ], - [ - "ËĪÉij", - "ËIJ" - ], - [ - "ËĪÊĮ", - "h" - ], - [ - "w", - "ËĪa" - ], - [ - "ËĪe", - "i" - ], - [ - "l", - "l" - ], - [ - "ĠÉ", - "IJ" - ], - [ - "Éij", - "ËIJ" - ], - [ - "a", - "n" - ], - [ - "É", - "Ł" - ], - [ - "ĠÊ", - "ĭ" - ], - [ - "Ġk", - "o" - ], - [ - "k", - "h" - ], - [ - "ɪ", - "Åĭ" - ], - [ - "ËĪaËIJ", - "ɪ" - ], - [ - "Ġt", - "Êĥ" - ], - [ - "ËĪaËIJ", - "t" - ], - [ - "ĠËĮ", - "e" - ], - [ - "ĠtÉķ", - "h" - ], - [ - "ËĪu", - "o" - ], - [ - "ËĪon", - "É¡" - ], - [ - "É", - "ĸ" - ], - [ - "a", - "t" - ], - [ - "Ġk", - "e" - ], - [ - "É", - "Ĵ" - ], - [ - "ĠÉķ", - "ËĪi" - ], - [ - "Ã", - "¸" - ], - [ - "ĠÉ", - "ij" - ], - [ - "ËĪeËIJ", - "k" - ], - [ - "Å", - "ĵ" - ], - [ - "r", - "e" - ], - [ - "Ġ", - "ɾ" - ], - [ - "Ġk", - "ÉĶ" - ], - [ - "ËĮ", - "ÊĬ" - ], - [ - "s", - "k" - ], - [ - "Ġ", - "ÊĬ" - ], - [ - "Ġa", - "nd" - ], - [ - "ɪ", - "ç" - ], - [ - "Ġm", - "e" - ], - [ - "ËĪa", - "ɾ" - ], - [ - "Ġ", - "ËĪɪ" - ], - [ - "n", - "a" - ], - [ - "Ġ", - "β" - ], - [ - "Ġl", - "ËĪi" - ], - [ - "j", - "aËIJ" - ], - [ - "l", - "i" - ], - [ - "n", - "o" - ], - [ - "Ġɪ", - "n" - ], - [ - "Ġd", - "ËĮi" - ], - [ - "ĠÉ", - "²" - ], - [ - "t", - "ËIJ" - ], - [ - "ÉĻ", - "m" - ], - [ - "Ġl", - "ÉĻ" - ], - [ - "Ġð", - "ÉĻ" - ], - [ - "ɪ", - "k" - ], - [ - "ËĪÉĽ", - "l" - ], - [ - 
"Éľ", - "t" - ], - [ - "Ġs", - "e" - ], - [ - "e", - "s" - ], - [ - "ËĪo", - "u" - ], - [ - "ËĪa", - "ÊĬ" - ], - [ - "ĠÉ", - "Ķ" - ], - [ - "ɪ", - "t" - ], - [ - "Ġ", - "Åĭ" - ], - [ - "ËĪÉĽ", - "n" - ], - [ - "Ê", - "İ" - ], - [ - "Ġk", - "h" - ], - [ - "ËĪÉĽ", - "nt" - ], - [ - "ËĪaËIJ", - "ɾ" - ], - [ - "Ġk", - "i" - ], - [ - "m", - "p" - ], - [ - "l", - "t" - ], - [ - "É", - "£" - ], - [ - "Ġp", - "a" - ], - [ - "ËĪÉĻ", - "ËIJ" - ], - [ - "ɪ", - "s" - ], - [ - "ĠÉ", - "Ĵ" - ], - [ - "Ġl", - "e" - ], - [ - "ɪ", - "Éľ" - ], - [ - "ËĪÉĽ", - "t" - ], - [ - "Ġd", - "e" - ], - [ - "ĠÉ", - "¹" - ], - [ - "Ġt", - "ËĪoËIJ" - ], - [ - "Ġ", - "Êģ" - ], - [ - "Êĥ", - "ÉĻn" - ], - [ - "ĠÊĬ", - "nt" - ], - [ - "ËĪÉĶ", - "ɾ" - ], - [ - "ËĪa", - "ð" - ], - [ - "Ġa", - "ɪ" - ], - [ - "ĠÊ", - "IJ" - ], - [ - "Ġm", - "ËĪa" - ], - [ - "r", - "a" - ], - [ - "Ġk", - "ËĪɪ" - ], - [ - "k", - "t" - ], - [ - "ËIJ", - "p" - ], - [ - "ĠÊ", - "Ī" - ], - [ - "ËĪaËIJ", - "ÊĬ" - ], - [ - "Ġk", - "ËĪÊĮɾ" - ], - [ - "Ġ", - "ËĪÊĮ" - ], - [ - "ĠÉĴ", - "v" - ], - [ - "Ġe", - "l" - ], - [ - "k", - "s" - ], - [ - "Ġk", - "w" - ], - [ - "ÉĻ", - "t" - ], - [ - "nd", - "o" - ], - [ - "e", - "i" - ], - [ - "ĠËĮa", - "ËIJp" - ], - [ - "s", - "e" - ], - [ - "ÉĻ", - "ɹ" - ], - [ - "ËĪu", - "ei" - ], - [ - "ÉĻ", - "s" - ], - [ - "Ġk", - "ËĮo" - ], - [ - "ĠÊ", - "Ĥ" - ], - [ - "ĠËĮ", - "ÊĬ" - ], - [ - "Ġ", - "c" - ], - [ - "ĠÉĽ", - "n" - ], - [ - "ËĪa", - "nt" - ], - [ - "θ", - "j" - ], - [ - "ËĮo", - "ËIJ" - ], - [ - "Ġ", - "ËĪaËIJ" - ], - [ - "Ġp", - "ɾ" - ], - [ - "s", - "i" - ], - [ - "Ġ", - "ËĪe" - ], - [ - "Ġj", - "uËIJ" - ], - [ - "Ġk", - "ËĮe" - ], - [ - "ËĮ", - "ɪ" - ], - [ - "ÉĶ", - "n" - ], - [ - "Ġs", - "ËĪÊĮ" - ], - [ - "Ġ", - "ËĪu" - ], - [ - "n", - "i" - ], - [ - "Ġs", - "t" - ], - [ - "Ġd", - "iËIJ" - ], - [ - "Ġk", - "eËIJ" - ], - [ - "ĠjËĪi", - "ou" - ], - [ - "ËĪai", - "Éľ" - ], - [ - "Ġd", - "ÊĴ" - ], - [ - "Ġ", - "ËĪÉĶ" - ], - [ - "v", - "a" - ], - [ - "ËIJ", - "ɾ" - ], - [ - "ËĪ", - "ø" 
- ], - [ - "ËĮÉĻ", - "ÊĬ" - ], - [ - "Ġp", - "ËĪu" - ], - [ - "Ġs", - "u" - ], - [ - "Ġm", - "a" - ], - [ - "Ġ", - "ÉĻ" - ], - [ - "d", - "ÊĴ" - ], - [ - "Ġp", - "ʰ" - ], - [ - "l", - "e" - ], - [ - "i", - "n" - ], - [ - "ĠtÉķh", - "ËĪi" - ], - [ - "Ġw", - "ËĪo" - ], - [ - "r", - "o" - ], - [ - "ËĮ", - "y" - ], - [ - "ɾ", - "a" - ], - [ - "Ġs", - "ËĪi" - ], - [ - "ð", - "ÉĻ" - ], - [ - "Ġs", - "eËIJ" - ], - [ - "l", - "a" - ], - [ - "ĠÊ", - "Ĵ" - ], - [ - "m", - "b" - ], - [ - "Ġh", - "ËĪoËIJ" - ], - [ - "Ġb", - "ʰ" - ], - [ - "ĠÉĽ", - "ɾ" - ], - [ - "Ġð", - "at" - ], - [ - "s", - "p" - ], - [ - "ÉĶ", - "ɾ" - ], - [ - "e", - "n" - ], - [ - "Ġs", - "ÉĻ" - ], - [ - "ËĪÉĶ", - "Éľ" - ], - [ - "Ġl", - "ËĮa" - ], - [ - "ĠËĮ", - "ÉĽ" - ], - [ - "Ġ", - "ËĪy" - ], - [ - "É¡", - "aËIJ" - ], - [ - "Ġd", - "ÉĽÉ¾" - ], - [ - "ËĪÉĽ", - "Êģ" - ], - [ - "Éľ", - "kh" - ], - [ - "ËĪi", - "ÉĻ" - ], - [ - "ËĪa", - "n" - ], - [ - "Ġm", - "ËĪo" - ], - [ - "ËĪa", - "β" - ], - [ - "Ġa", - "l" - ], - [ - "Ġ", - "ËĪeËIJ" - ], - [ - "Ġ", - "θ" - ], - [ - "Ġn", - "ËĪi" - ], - [ - "p", - "ʰ" - ], - [ - "ll", - "a" - ], - [ - "Ġp", - "l" - ], - [ - "ËĪ", - "Åĵ" - ], - [ - "j", - "ËĪÉiju" - ], - [ - "Ġa", - "v" - ], - [ - "Ġm", - "ËĪi" - ], - [ - "Ġf", - "ËĪa" - ], - [ - "ËĪÉ", - "ľ" - ], - [ - "m", - "e" - ], - [ - "ËĮÉĻ", - "h" - ], - [ - "ËĪu", - "ÉĻ" - ], - [ - "i", - "t" - ], - [ - "j", - "ËĪe" - ], - [ - "Ġ", - "o" - ], - [ - "ËĪÉľ", - "ËIJ" - ], - [ - "ĠtÉķËĪi", - "ou" - ], - [ - "ÉĶ", - "ËIJ" - ], - [ - "Ġn", - "ÉĻ" - ], - [ - "ËĪÉĻ", - "Éľn" - ], - [ - "Ġm", - "ÉĻ" - ], - [ - "Ġd", - "eËIJ" - ], - [ - "m", - "o" - ], - [ - "s", - "a" - ], - [ - "j", - "ËĪÉĶ" - ], - [ - "ËĪa", - "l" - ], - [ - "ĠtÉķ", - "ËĪiÉĽ" - ], - [ - "ĠÉ¡", - "ÉĻ" - ], - [ - "ð", - "a" - ], - [ - "Ġɪ", - "z" - ], - [ - "Ġs", - "a" - ], - [ - "r", - "i" - ], - [ - "ĠËĮi", - "l" - ], - [ - "ËĮ", - "u" - ], - [ - "Ġk", - "aËIJ" - ], - [ - "ĠÉĻ", - "ËIJ" - ], - [ - "ĠÉ", - "ĸ" - ], - [ - "Ġk", - "a" - ], - [ - "ËĪÊĮh", 
- "i" - ], - [ - "Ġj", - "eËIJ" - ], - [ - "Ġt", - "ʰ" - ], - [ - "n", - "e" - ], - [ - "k", - "ËIJ" - ], - [ - "Ġts", - "ËĪai" - ], - [ - "Ġ", - "ËĪeËIJk" - ], - [ - "n", - "k" - ], - [ - "t", - "i" - ], - [ - "ËĪa", - "Éľn" - ], - [ - "Ġk", - "ËIJ" - ], - [ - "É¡", - "ÉĻn" - ], - [ - "ËĪi", - "a" - ], - [ - "ĠÉĶ", - "ËIJɾ" - ], - [ - "Ê", - "ı" - ], - [ - "ĠËĮ", - "ÊĮ" - ], - [ - "Ġz", - "ËĪaËIJ" - ], - [ - "Ġl", - "os" - ], - [ - "ÉĽ", - "s" - ], - [ - "ËĪÉĶ", - "n" - ], - [ - "ÉĽ", - "nt" - ], - [ - "ÉĽ", - "n" - ], - [ - "ĠÉŁ", - "ËĪoËIJ" - ], - [ - "ç", - "t" - ], - [ - "Ġd", - "as" - ], - [ - "Ġx", - "ËĮo" - ], - [ - "ËĪu", - "Éľ" - ], - [ - "ËĪa", - "s" - ], - [ - "Ġb", - "ËĪÊĮ" - ], - [ - "ËĪiÉĽ", - "Éľn" - ], - [ - "É", - "IJ" - ], - [ - "Ġts", - "uËIJ" - ], - [ - "Ġp", - "ËĮÉĽ" - ], - [ - "Ġn", - "ËĪÉĶ" - ], - [ - "ÊĬ", - "t" - ], - [ - "m", - "a" - ], - [ - "Ġn", - "ËĪo" - ], - [ - "Ġl", - "ËĪɪ" - ], - [ - "ËĪÉĽ", - "s" - ], - [ - "ɪ", - "l" - ], - [ - "ĠÉķ", - "ËĪiÉĽ" - ], - [ - "Ġ", - "ËĪÊĬ" - ], - [ - "ÉĴ", - "t" - ], - [ - "t", - "o" - ], - [ - "Ġ", - "ËĪo" - ], - [ - "ËĮo", - "n" - ], - [ - "Ġk", - "wËĪa" - ], - [ - "Ġɪ", - "t" - ], - [ - "Ġh", - "oËIJ" - ], - [ - "ËĪiËIJ", - "k" - ], - [ - "ĠËĮaËIJp", - "k" - ], - [ - "ËĪaɪ", - "n" - ], - [ - "Ã", - "¦" - ], - [ - "ÉĻn", - "t" - ], - [ - "t", - "a" - ], - [ - "l", - "o" - ], - [ - "Ġn", - "ËĪÉij" - ], - [ - "Ġl", - "ËĪa" - ], - [ - "ËĪi", - "Éľ" - ], - [ - "Ġw", - "ËĪei" - ], - [ - "ÉĽ", - "Êģ" - ], - [ - "Ġt", - "ËĪa" - ], - [ - "Ġɾ", - "ËĮÉĻh" - ], - [ - "ĠÉķËĪi", - "Éij" - ], - [ - "ËĮi", - "ËIJ" - ], - [ - "ËĮÉĽ", - "l" - ], - [ - "ĠtÉĻ", - "Éľ" - ], - [ - "Ġk", - "ËĪuo" - ], - [ - "Ġt", - "ËĪu" - ], - [ - "j", - "ËĪÉĽ" - ], - [ - "ĠËĮi", - "n" - ], - [ - "ɾ", - "e" - ], - [ - "Ġk", - "oËIJ" - ], - [ - "Ġk", - "ËĪa" - ], - [ - "ɾ", - "i" - ], - [ - "ĠtÉķËĪi", - "Éij" - ], - [ - "l", - "ÉĻ" - ], - [ - "Ġk", - "ÉĻ" - ], - [ - "Ġt", - "ËĪi" - ], - [ - "ĠÅĭ", - "ËĪyÉĻ" - ], - [ - "Ġts", - "h" - ], 
- [ - "e", - "r" - ], - [ - "a", - "v" - ], - [ - "ĠkÉĶ", - "n" - ], - [ - "ËĪÉĻ", - "ÉľÅĭ" - ], - [ - "ð", - "o" - ], - [ - "ËĪaËIJ", - "n" - ], - [ - "Ġbʰ", - "ËĪi" - ], - [ - "ĠkËIJ", - "jaËIJ" - ], - [ - "ÉĻ", - "z" - ], - [ - "Ġp", - "Êģ" - ], - [ - "Ġd", - "ËĪɪ" - ], - [ - "Ġz", - "iËIJ" - ], - [ - "É¡", - "eËIJ" - ], - [ - "Ġt", - "ËĪÉĻ" - ], - [ - "ɪ", - "z" - ], - [ - "Ġn", - "ËĮon" - ], - [ - "t", - "aËIJ" - ], - [ - "b", - "l" - ], - [ - "t", - "e" - ], - [ - "n", - "ËĮeËIJ" - ], - [ - "ËĪɪ", - "l" - ], - [ - "s", - "o" - ], - [ - "k", - "o" - ], - [ - "u", - "Êģ" - ], - [ - "ĠÉ", - "£" - ], - [ - "Ġpa", - "Êģ" - ], - [ - "Ġ", - "ËĪÉĽ" - ], - [ - "j", - "ËĪuËIJ" - ], - [ - "ËĮ", - "ÊĮ" - ], - [ - "y", - "n" - ], - [ - "ËĪiËIJ", - "n" - ], - [ - "Ġl", - "ËĪaɪ" - ], - [ - "ËĪɪ", - "Åĭ" - ], - [ - "ĠtÉķh", - "ËĪy" - ], - [ - "Ġn", - "ËĪÊĮhi" - ], - [ - "Ġd", - "ËĮe" - ], - [ - "Ġj", - "ËĪÉiju" - ], - [ - "Ġt", - "ËĪÉiju" - ], - [ - "Ġh", - "ËĪo" - ], - [ - "ɪ", - "d" - ], - [ - "Ġth", - "ËĪÉij" - ], - [ - "m", - "ËĪe" - ], - [ - "Ġ", - "ËĪÉĻ" - ], - [ - "j", - "a" - ], - [ - "Ġp", - "h" - ], - [ - "ÉĽ", - "t" - ], - [ - "Ġk", - "ËĪÊĮ" - ], - [ - "t", - "ÉĻn" - ], - [ - "m", - "ËĪÉij" - ], - [ - "w", - "ËĪe" - ], - [ - "ĠËĮa", - "ɪn" - ], - [ - "Ġð", - "ɪs" - ], - [ - "É¡", - "ÉĻ" - ], - [ - "Ġn", - "ËĪaËIJ" - ], - [ - "Ġb", - "ËĪaËIJ" - ], - [ - "Ġa", - "θ" - ], - [ - "Ġm", - "ËĮa" - ], - [ - "ËĪÊĮh", - "a" - ], - [ - "Ġd", - "ËĮa" - ], - [ - "ËĪ", - "Êı" - ], - [ - "Ġɲ", - "ËĮy" - ], - [ - "Ġp", - "ËĪa" - ], - [ - "ËĪað", - "o" - ], - [ - "d", - "i" - ], - [ - "b", - "Éľ" - ], - [ - "É", - "³" - ], - [ - "Ġw", - "iËIJ" - ], - [ - "Ġn", - "ËĪɪ" - ], - [ - "ĠÉ¡", - "ËĪÉĶÉľ" - ], - [ - "tËIJ", - "o" - ], - [ - "ËĮÉĻ", - "m" - ], - [ - "ËĪaËIJ", - "r" - ], - [ - "Ġm", - "ÉĽ" - ], - [ - "ËĪeËIJ", - "É¡aËIJ" - ], - [ - "Ġs", - "ËĮi" - ], - [ - "Ġl", - "ËĮaËIJ" - ], - [ - "n", - "ËĮaËIJ" - ], - [ - "Ġs", - "p" - ], - [ - "t", - "Êģ" - ], - [ - "ĠÊ", - "İ" - ], - 
[ - "ËĮ", - "ÉijËIJ" - ], - [ - "Ġk", - "l" - ], - [ - "k", - "ʰ" - ], - [ - "i", - "l" - ], - [ - "ĠÊĥ", - "t" - ], - [ - "ĠËĮÊĬ", - "n" - ], - [ - "a", - "l" - ], - [ - "Ġs", - "ËĪÉĽ" - ], - [ - "Ġm", - "ËĪaËIJ" - ], - [ - "Ġ", - "Åĵ" - ], - [ - "ĠÉ¡", - "ËĪÊĮ" - ], - [ - "ĠpËĮÉĽ", - "r" - ], - [ - "ɾ", - "ËĪa" - ], - [ - "ËIJ", - "ÊĪ" - ], - [ - "ËĪaβ", - "a" - ], - [ - "Ġw", - "ËĪÉĴ" - ], - [ - "Ġx", - "ËĪuei" - ], - [ - "Ġkh", - "ËĪo" - ], - [ - "Ġla", - "s" - ], - [ - "ĠÉĹ", - "ËĪo" - ], - [ - "Ġf", - "ÉĽÉ¾" - ], - [ - "Ġj", - "ËĪiÉĽ" - ], - [ - "Ġt", - "ËĪe" - ], - [ - "Ġk", - "ËĮÉĶ" - ], - [ - "ĠdeËIJ", - "n" - ], - [ - "Ġm", - "o" - ], - [ - "Ġp", - "ËĪi" - ], - [ - "Ġt", - "ËĪÉij" - ], - [ - "ËĪÉĽ", - "st" - ], - [ - "w", - "ËĪÉij" - ], - [ - "ËĪaɪ", - "t" - ], - [ - "ÉĻ", - "ÊĬ" - ], - [ - "Ġ", - "ËĪi" - ], - [ - "ɪ", - "j" - ], - [ - "a", - "ɪ" - ], - [ - "ËĪaËIJ", - "Éľ" - ], - [ - "ĠËĪɪ", - "s" - ], - [ - "Ġp", - "ÉĶɾ" - ], - [ - "æ", - "Éľn" - ], - [ - "k", - "a" - ], - [ - "Åĭ", - "É¡" - ], - [ - "b", - "ÉĻn" - ], - [ - "ÊĬ", - "f" - ], - [ - "Ġp", - "ɹ" - ], - [ - "Ġl", - "ËĮe" - ], - [ - "ËĪiËIJ", - "d" - ], - [ - "ËĪaËIJ", - "re" - ], - [ - "Ġm", - "ËĪÊĮ" - ], - [ - "ÉĻ", - "r" - ], - [ - "Ġd", - "Éij" - ], - [ - "ËĪaËIJt", - "o" - ], - [ - "Ġp", - "ËĪeËIJ" - ], - [ - "Ġd", - "ËĪoËIJ" - ], - [ - "Ġs", - "ËĮÊĬ" - ], - [ - "Ġh", - "ËĪi" - ], - [ - "Ġs", - "ËĪa" - ], - [ - "ËĪeËIJ", - "n" - ], - [ - "d", - "ÉĻ" - ], - [ - "Ġp", - "j" - ], - [ - "ËĪÅĵ", - "Êģ" - ], - [ - "l", - "ɪç" - ], - [ - "ÉĴ", - "n" - ], - [ - "ĠËĪÉĻ", - "r" - ], - [ - "t", - "ËĪe" - ], - [ - "Ġi", - "l" - ], - [ - "ËĪaËIJ", - "l" - ], - [ - "Ġs", - "ËĮÉĻÊĬ" - ], - [ - "s", - "ÊĪ" - ], - [ - "Ġd", - "ËĪuËIJ" - ], - [ - "h", - "ËĪÉij" - ], - [ - "Ġx", - "ËĪou" - ], - [ - "Ġl", - "ËĪaiÉľ" - ], - [ - "w", - "ËĪo" - ], - [ - "ËĪÉĽnt", - "e" - ], - [ - "Ġs", - "y" - ], - [ - "Ġz", - "ɪç" - ], - [ - "ĠÉ¡", - "ËĪu" - ], - [ - "ĠÉķ", - "ËĪy" - ], - [ - "ËĪÉĶËIJ", - "l" - ], - [ - "ÉĶ", 
- "l" - ], - [ - "Ġt", - "ËĪo" - ], - [ - "ĠÊĭ", - "oËIJ" - ], - [ - "Ġ", - "iËIJ" - ], - [ - "wËĪa", - "ða" - ], - [ - "ËĪa", - "ndo" - ], - [ - "Ġaθ", - "ÉĽnt" - ], - [ - "Ġaθɼnt", - "wËĪaða" - ], - [ - "Ġt", - "ËĪiÉĽ" - ], - [ - "ËĪei", - "Éľ" - ], - [ - "Ġp", - "ËĮa" - ], - [ - "Ġn", - "ËĪaɪ" - ], - [ - "w", - "a" - ], - [ - "Ġf", - "r" - ], - [ - "ĠÊIJ", - "ËĪÉĻÉľn" - ], - [ - "ËĪu", - "a" - ], - [ - "m", - "i" - ], - [ - "Ġm", - "ËĪÉĽ" - ], - [ - "ËĪeËIJk", - "ʰ" - ], - [ - "c", - "ʰ" - ], - [ - "Ġw", - "ËĪÉij" - ], - [ - "st", - "a" - ], - [ - "Ġt", - "u" - ], - [ - "Ġs", - "k" - ], - [ - "ËĪÉĶ", - "l" - ], - [ - "ËĪeËIJ", - "ÊĪ" - ], - [ - "Ġl", - "ËĪaËIJɪ" - ], - [ - "Ġl", - "ËĪaËIJ" - ], - [ - "ËĪÉĽËIJ", - "s" - ], - [ - "ËĪÉĽÉ¾", - "a" - ], - [ - "ËĪÉĻ", - "Éľt" - ], - [ - "Ġ", - "yn" - ], - [ - "d", - "ÉĻn" - ], - [ - "Ġd", - "i" - ], - [ - "ËĪiËIJ", - "s" - ], - [ - "Ġðe", - "l" - ], - [ - "ËĪÊĮ", - "r" - ], - [ - "Ġh", - "ËĪaËIJ" - ], - [ - "Ġb", - "ÉĻ" - ], - [ - "Ġj", - "ËĪuËIJ" - ], - [ - "ll", - "e" - ], - [ - "st", - "o" - ], - [ - "ËĪɪ", - "t" - ], - [ - "ËĪoËIJ", - "ɾ" - ], - [ - "b", - "ʰ" - ], - [ - "m", - "ÉĻn" - ], - [ - "ËĮu", - "ÉĻ" - ], - [ - "ËĮÉĻ", - "ɾ" - ], - [ - "ËĪÊĮ", - "n" - ], - [ - "ĠlËĪaɪ", - "k" - ], - [ - "Ġb", - "ËĪa" - ], - [ - "ɪ", - "ð" - ], - [ - "Ġl", - "o" - ], - [ - "z", - "i" - ], - [ - "ËĪÊĮ", - "st" - ], - [ - "m", - "ËĪi" - ], - [ - "ÉĶ", - "Êģ" - ], - [ - "ĠnËĪɪ", - "çt" - ], - [ - "Ġt", - "ɾ" - ], - [ - "Ġd", - "ËĪeËIJkʰ" - ], - [ - "Ġs", - "ËĮe" - ], - [ - "Ġn", - "ËĪÉĻÊĬ" - ], - [ - "Ġ", - "u" - ], - [ - "Ġs", - "i" - ], - [ - "Ġɪ", - "ç" - ], - [ - "Ġp", - "r" - ], - [ - "ĠtÉķ", - "ËĪy" - ], - [ - "Ġm", - "ËĪu" - ], - [ - "z", - "a" - ], - [ - "Ġt", - "Êģ" - ], - [ - "Ġw", - "ɪð" - ], - [ - "t", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪÊĮɾ" - ], - [ - "Ġk", - "ËĪÉĶ" - ], - [ - "ËĪoËIJ", - "r" - ], - [ - "Ġh", - "ËĮa" - ], - [ - "Ġk", - "ËĪonÉ¡" - ], - [ - "Ġp", - "uÊģ" - ], - [ - "Ġd", - "y" - ], - [ - "ËĪɪ", - "n" - 
], - [ - "nt", - "e" - ], - [ - "Ġk", - "ËĮa" - ], - [ - "ËĪÉĻ", - "ɪ" - ], - [ - "Ġm", - "i" - ], - [ - "ĠÉ¡", - "ËĮuÉĻ" - ], - [ - "ĠÊ", - "²" - ], - [ - "Ġf", - "ËĪÉij" - ], - [ - "Ġv", - "ÉijËIJ" - ], - [ - "ĠËĮa", - "ÊĬ" - ], - [ - "ËĮ", - "uËIJ" - ], - [ - "ĠËĪu", - "n" - ], - [ - "Ġj", - "ËĪÊĮha" - ], - [ - "j", - "uËIJ" - ], - [ - "Ġm", - "ɪt" - ], - [ - "Ġl", - "ËĪÉĽ" - ], - [ - "ËĪeËIJ", - "Êĥ" - ], - [ - "Ġf", - "ÉĶËIJ" - ], - [ - "m", - "ÉĻ" - ], - [ - "ɾ", - "t" - ], - [ - "ĠkËĮo", - "n" - ], - [ - "Ġl", - "ËĪÉĶ" - ], - [ - "Ġx", - "ËĪÉiju" - ], - [ - "p", - "l" - ], - [ - "Ġd", - "ËĪi" - ], - [ - "Ġl", - "ËĪoËIJ" - ], - [ - "s", - "ÉĻ" - ], - [ - "ËĪaËIJ", - "va" - ], - [ - "Ġl", - "ËĪu" - ], - [ - "ĠÉ¡", - "ËĮÉĻÊĬ" - ], - [ - "Ġh", - "av" - ], - [ - "ĠËĮaËIJpk", - "ËĮoËIJ" - ], - [ - "ɾ", - "ËĪi" - ], - [ - "Ġf", - "ËĪÉĻ" - ], - [ - "Ġh", - "ËĮÉĻm" - ], - [ - "ËĪonÉ¡", - "Éľ" - ], - [ - "j", - "o" - ], - [ - "Ġs", - "ÉĶ" - ], - [ - "ËĪaËIJ", - "d" - ], - [ - "w", - "ËĪiÉĻ" - ], - [ - "ËĪa", - "nd" - ], - [ - "ËĮa", - "ɪn" - ], - [ - "t", - "ɾ" - ], - [ - "ĠËĮ", - "ɪ" - ], - [ - "ĠËĪu", - "na" - ], - [ - "Ġx", - "wËĪÉij" - ], - [ - "Ġj", - "ÉĶËIJ" - ], - [ - "Êģ", - "ËĪi" - ], - [ - "ĠkËĪuo", - "Éľ" - ], - [ - "Ġa", - "β" - ], - [ - "ĠÉ¡", - "ËĪaËIJ" - ], - [ - "an", - "o" - ], - [ - "t", - "ÉĻl" - ], - [ - "Ġr", - "ËĮe" - ], - [ - "ËĮÊĮ", - "t" - ], - [ - "ĠjËĪi", - "Éij" - ], - [ - "ĠɾËĮÉĻh", - "aËIJ" - ], - [ - "Ġm", - "ËĪe" - ], - [ - "ĠËĪy", - "Ã¦Éľn" - ], - [ - "Ġf", - "ËĪu" - ], - [ - "Ġb", - "l" - ], - [ - "n", - "ËĪi" - ], - [ - "s", - "ÉĻn" - ], - [ - "Ġa", - "ɪn" - ], - [ - "ËĪi", - "ÊĬ" - ], - [ - "Ġðe", - "ɪ" - ], - [ - "Ġɪ", - "ts" - ], - [ - "Ġ", - "(" - ], - [ - "ËĪy", - "ËIJ" - ], - [ - "ÉĻ", - "d" - ], - [ - "ĠËĮ", - "o" - ], - [ - "ĠÉĽ", - "s" - ], - [ - "Ġv", - "iËIJ" - ], - [ - "ËIJ", - "É¡eËIJ" - ], - [ - "k", - "ËĪe" - ], - [ - "ĠËĪa", - "l" - ], - [ - "ÉĽ", - "l" - ], - [ - "Ġ", - "ÊĮ" - ], - [ - "ËIJ", - "o" - ], - [ - "Ġk", - 
"ËĪo" - ], - [ - "ĠÊĪ", - "ËĪuËIJ" - ], - [ - "Ġs", - "ËĪɪ" - ], - [ - "ËĪeËIJ", - "ɾ" - ], - [ - "Éľ", - "m" - ], - [ - "ËĮ", - "ÉĻn" - ], - [ - "ËĪaËIJ", - "i" - ], - [ - "ËĪoËIJ", - "l" - ], - [ - "ɪ", - "ËĮeËIJ" - ], - [ - "Ġʲ", - "ËĪy" - ], - [ - "Ġk", - "ËĪÉĶËIJ" - ], - [ - "s", - "ËĪi" - ], - [ - "Ġl", - "ËĪe" - ], - [ - "ËĮ", - "ÉĴt" - ], - [ - "ËĪiËIJ", - "p" - ], - [ - "a", - "Êģ" - ], - [ - "Ġθ", - "ËĪɪÅĭ" - ], - [ - "ËĪÉĻËIJ", - "ɪ" - ], - [ - "ËĪÊĮ", - "l" - ], - [ - "ĠhËĪoËIJ", - "taËIJ" - ], - [ - "ËĪo", - "ɪ" - ], - [ - "nt", - "o" - ], - [ - "z", - "h" - ], - [ - "ĠdeËIJ", - "m" - ], - [ - "ĠkÉĶ", - "m" - ], - [ - "ʰ", - "ËĪiËIJk" - ], - [ - "ĠdÊĴ", - "ËĪÊĮst" - ], - [ - "p", - "ɾ" - ], - [ - "Ġl", - "y" - ], - [ - "h", - "ËĪu" - ], - [ - "ËĪÉĶ", - "ø" - ], - [ - "ËĪaËIJ", - "s" - ], - [ - "ĠËĪa", - "n" - ], - [ - "Ġ", - "ËĪÉĴ" - ], - [ - "Ġk", - "an" - ], - [ - "Ġts", - "ËĪuo" - ], - [ - "ËĪeËIJ", - "va" - ], - [ - "ĠÉ¡", - "ɾ" - ], - [ - "Ġp", - "o" - ], - [ - "ĠtÊĥ", - "ËĪÉĶ" - ], - [ - "Êİ", - "a" - ], - [ - "Ġm", - "ËĮi" - ], - [ - "Êĥ", - "t" - ], - [ - "t", - "ËĪi" - ], - [ - "Ġh", - "ËĪÊĮ" - ], - [ - "tÊĥ", - "e" - ], - [ - "Ġf", - "ÉĶn" - ], - [ - "v", - "e" - ], - [ - "Ġn", - "ËĮe" - ], - [ - "ËĪÉĶ", - "Êģ" - ], - [ - "i", - "z" - ], - [ - "Ġs", - "ËĪuo" - ], - [ - "ËĪÉĽËIJ", - "r" - ], - [ - "wËĪa", - "Êģ" - ], - [ - "ËĪað", - "a" - ], - [ - "Åĭ", - "k" - ], - [ - "p", - "o" - ], - [ - "Ġk", - "ËĪi" - ], - [ - "ËĪa", - "d" - ], - [ - "Ġv", - "ËĪi" - ], - [ - "t", - "Éķ" - ], - [ - "Ġk", - "ËĪÉĻ" - ], - [ - "Ġw", - "ËĪu" - ], - [ - "ÉĴ", - "z" - ], - [ - "ĠvÉijËIJ", - "ɾ" - ], - [ - "Êģ", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪaËIJ" - ], - [ - "k", - "e" - ], - [ - "n", - "ÉĻ" - ], - [ - "ËĪÊĮ", - "b" - ], - [ - "ËĪuËIJ", - "ɾ" - ], - [ - "ËĮÉĻ", - "ËIJ" - ], - [ - "ĠÊĪ", - "ʰËĪiËIJk" - ], - [ - "Ġk", - "ËĪu" - ], - [ - "Ġb", - "ËĮÊĮt" - ], - [ - "Ġa", - "t" - ], - [ - "Ġf", - "ɹ" - ], - [ - "ËĪa", - "x" - ], - [ - "Ġz", - "oËIJ" - ], - [ - "Ġt", 
- "ËĪaËIJ" - ], - [ - "Ġð", - "ËĮe" - ], - [ - "n", - "eËIJ" - ], - [ - "ĠÉij", - "ËIJ" - ], - [ - "Ġa", - "ÊĬf" - ], - [ - "a", - "m" - ], - [ - "ÊĬ", - "Åĭ" - ], - [ - "ĠÉĶ", - "ËIJ" - ], - [ - "ĠÉķËĪi", - "ÉľÅĭ" - ], - [ - "Ġ", - "ËĪÉĶËIJl" - ], - [ - "ɪ", - "m" - ], - [ - "j", - "ËĪo" - ], - [ - "ËĪiËIJ", - "ÉŁ" - ], - [ - "Ġkw", - "ËĮÉĽ" - ], - [ - "ĠmËĪa", - "s" - ], - [ - "ÉĻ", - "h" - ], - [ - "ĠËĪa", - "ÊĬ" - ], - [ - "ËĪÉĶ", - "ɪ" - ], - [ - "É¡", - "ÉĻɾ" - ], - [ - "r", - "ÉĻn" - ], - [ - "ËĪɪ", - "k" - ], - [ - "s", - "se" - ], - [ - "Ġp", - "ËĪÉij" - ], - [ - "ĠÉĹ", - "ËĮe" - ], - [ - "ĠÉĹ", - "ËĪi" - ], - [ - "Ġa", - "z" - ], - [ - "ĠÉ¡ËĪÊĮ", - "jaËIJ" - ], - [ - "z", - "e" - ], - [ - "ĠÉĹ", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĪi" - ], - [ - "ĠËĮ", - "ÉĴn" - ], - [ - "Ġx", - "ËĪo" - ], - [ - "ĠËĮÊĬ", - "na" - ], - [ - "Ġtʰ", - "aËIJ" - ], - [ - "Ġs", - "Éij" - ], - [ - "ËĪeɪ", - "ÊĥÉĻn" - ], - [ - "ĠtÉķËĪi", - "Éľ" - ], - [ - "ĠÉŁ", - "aËIJ" - ], - [ - "p", - "ËIJ" - ], - [ - "Ġpl", - "y" - ], - [ - "θ", - "ËĪi" - ], - [ - "ËIJ", - "Éĸ" - ], - [ - "Ġt", - "ËĪuei" - ], - [ - "Ġl", - "ËĪÉĻ" - ], - [ - "Ġd", - "ÉijËIJ" - ], - [ - "f", - "t" - ], - [ - "ËĪa", - "m" - ], - [ - "ĠsËĪÊĮ", - "kt" - ], - [ - "Ġt", - "ËĪou" - ], - [ - "Ġp", - "ËĪiÉĽ" - ], - [ - "ĠËĪa", - "i" - ], - [ - "ĠwËĪÉĴ", - "n" - ], - [ - "Ġz", - "ËĮaɪn" - ], - [ - "Ġe", - "st" - ], - [ - "Ġm", - "ÉĶ" - ], - [ - "ĠtÉķ", - "jËĪÉiju" - ], - [ - "Éľ", - "p" - ], - [ - "ËĪÊĮ", - "z" - ], - [ - "b", - "i" - ], - [ - "ËĪÉĽËIJs", - "eËIJ" - ], - [ - "Ġl", - "ËĪy" - ], - [ - "Ġm", - "ËĮe" - ], - [ - "Ġd", - "ËĮÉĽl" - ], - [ - "ËĪiËIJ", - "l" - ], - [ - "ĠkËĮo", - "mo" - ], - [ - "Ġh", - "ËĪaÉľn" - ], - [ - "ËĪoËIJ", - "ne" - ], - [ - "ĠkËĪÊĮɾ", - "t" - ], - [ - "Ġsy", - "Êģ" - ], - [ - "ËĮÉĶ", - "ɾ" - ], - [ - "Ġɪ", - "f" - ], - [ - "u", - "v" - ], - [ - "z", - "ÉĻn" - ], - [ - "o", - "l" - ], - [ - "Ï", - "ĩ" - ], - [ - "i", - "m" - ], - [ - "Ġm", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ɪ" - ], - [ - "Ġv", - 
"ËĪÉĽ" - ], - [ - "ÊĬ", - "d" - ], - [ - "Ġt", - "r" - ], - [ - "ËĪeËIJ", - "s" - ], - [ - "ð", - "e" - ], - [ - "d", - "e" - ], - [ - "ʰ", - "Ïĩ" - ], - [ - "ÉŁ", - "ʰ" - ], - [ - "ËĮÉĻËIJ", - "ÉªÉľ" - ], - [ - "b", - "ËIJ" - ], - [ - "ËĪÊĬ", - "k" - ], - [ - "ĠnËĪÉĶ", - "ÉªÉľ" - ], - [ - "ĠËĮ", - "iËIJ" - ], - [ - "ËĪÉijËIJ", - "t" - ], - [ - "ËĪiËIJ", - "ɾ" - ], - [ - "Ġt", - "ɹ" - ], - [ - "ɾ", - "ÉĶ" - ], - [ - "Ġw", - "ÉĴz" - ], - [ - "Ġv", - "u" - ], - [ - "b", - "ÉĻl" - ], - [ - "b", - "ÉĻ" - ], - [ - "ɹ", - "i" - ], - [ - "nt", - "s" - ], - [ - "Ġs", - "ËĪaËIJ" - ], - [ - "d", - "ʰ" - ], - [ - "Ġt", - "ÊĬ" - ], - [ - "ĠÊİ", - "ËĮi" - ], - [ - "β", - "a" - ], - [ - "h", - "ËĪÉĻÉľÅĭ" - ], - [ - "Ġs", - "ËĪiËIJ" - ], - [ - "ĠpËĮa", - "ɾa" - ], - [ - "ËĪÉĽÉ¾", - "ÉĶ" - ], - [ - "ËĪɪ", - "s" - ], - [ - "É£", - "o" - ], - [ - "ĠËĮa", - "l" - ], - [ - "o", - "r" - ], - [ - "Ġb", - "ËĪÊĮh" - ], - [ - "Ġk", - "ËĪoËIJ" - ], - [ - "Ġt", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪo" - ], - [ - "ĠÊĴ", - "ÉĻ" - ], - [ - "p", - "Êģ" - ], - [ - "Ġ", - "ËĪaɪ" - ], - [ - "hËĪÉij", - "ÉľÅĭ" - ], - [ - "ÉĻl", - "i" - ], - [ - "ËĪeɪ", - "t" - ], - [ - "ĠjËĪiou", - "Éľ" - ], - [ - "Ġd", - "ËĪÉĻ" - ], - [ - "Ġm", - "ËĪÉĶËIJ" - ], - [ - "l", - "ËĪi" - ], - [ - "ËĮy", - "ÉĻ" - ], - [ - "ĠlËĪoËIJ", - "É¡" - ], - [ - "Ġn", - "ËĪÊĮ" - ], - [ - "Ġh", - "ËĪÊĬ" - ], - [ - "Ġn", - "ËĪÉĻÉľÅĭ" - ], - [ - "ĠÊģ", - "ÉĻ" - ], - [ - "z", - "ËĪi" - ], - [ - "Ġt", - "ËĪuËIJ" - ], - [ - "ĠkËĮo", - "me" - ], - [ - "Ġl", - "ËĪeËIJ" - ], - [ - "ËĪaËIJt", - "aËIJ" - ], - [ - "Ġa", - "n" - ], - [ - "ĠËĪy", - "u" - ], - [ - "ĠËĮÊĮ", - "É¡ÉĻɾ" - ], - [ - "ĠËĪɪ", - "n" - ], - [ - "ĠhËĪo", - "ÉĻ" - ], - [ - "v", - "ÉĻ" - ], - [ - "ËĪø", - "ËIJ" - ], - [ - "θj", - "a" - ], - [ - "ËĪuÉĻ", - "Éľn" - ], - [ - "Ġk", - "ÉĻɾ" - ], - [ - "ËĪa", - "t" - ], - [ - "j", - "ËĪø" - ], - [ - "ËĪÉĽt", - "Êģ" - ], - [ - "Ġp", - "ËĪÉiju" - ], - [ - "st", - "ÉĻ" - ], - [ - "Ġw", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "l" - ], - [ - "ÊĪ", - "i" 
- ], - [ - "Ġx", - "ËĪaiÉľ" - ], - [ - "ËĪy", - "Êģ" - ], - [ - "ĠhËĪoËIJ", - "É¡aËIJ" - ], - [ - "Ġts", - "ËĪi" - ], - [ - "ĠËĪÊĮ", - "p" - ], - [ - "Ġn", - "ËĮÉĴt" - ], - [ - "ĠlËĪɪ", - "eËIJ" - ], - [ - "Ġh", - "ËĪa" - ], - [ - "Ġf", - "l" - ], - [ - "Ġn", - "ËĪeËIJ" - ], - [ - "ËĮaËIJ", - "ɪ" - ], - [ - "Ġt", - "ËĪuo" - ], - [ - "tÊĥ", - "ËIJ" - ], - [ - "s", - "ËĪe" - ], - [ - "bʰ", - "i" - ], - [ - "ĠbËĪÊĮh", - "ÊĬt" - ], - [ - "ËĪÉĽ", - "nd" - ], - [ - "Ġs", - "ËĪÉĶ" - ], - [ - "ÉĻn", - "s" - ], - [ - "ËĮÉĻ", - "l" - ], - [ - "ÉĽ", - "Éľ" - ], - [ - "ĠÉ¡", - "l" - ], - [ - "ËĪɪ", - "ɾ" - ], - [ - "ËĪaËIJt", - "a" - ], - [ - "Éľ", - "ËIJ" - ], - [ - "ËĪÉĽnt", - "o" - ], - [ - "sk", - "ËĮoËIJ" - ], - [ - "ËĪÉĽ", - "k" - ], - [ - "ts", - "i" - ], - [ - "Ġt", - "ËĪonÉ¡" - ], - [ - "Ġb", - "iËIJ" - ], - [ - "Ġh", - "ËĪaËIJɪ" - ], - [ - "Ġb", - "ËĪi" - ], - [ - "j", - "j" - ], - [ - "Êİ", - "i" - ], - [ - "Ġk", - "ʰ" - ], - [ - "Ġs", - "ËĪo" - ], - [ - "ll", - "o" - ], - [ - "Ġb", - "aɪ" - ], - [ - "ĠÉĽ", - "nt" - ], - [ - "Ġ", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪo" - ], - [ - "ɾ", - "eËIJ" - ], - [ - "Ġk", - "Êĭ" - ], - [ - "Ġm", - "ËĪeiÉľ" - ], - [ - "ÊĬ", - "ËĪÉĶËIJ" - ], - [ - "Ġt", - "ËĪaɪ" - ], - [ - "Ġsu", - "s" - ], - [ - "Ġr", - "i" - ], - [ - "Ġv", - "ËĮÉĽ" - ], - [ - "ËĪiËIJ", - "no" - ], - [ - "v", - "ano" - ], - [ - "ĠdËĮi", - "ËIJ" - ], - [ - "ĠÊIJ", - "ËĪaÉľn" - ], - [ - "Ê", - "Ĥ" - ], - [ - "ĠÉIJ", - "b" - ], - [ - "ËĪaËIJ", - "h" - ], - [ - "ɪ", - "Êĥ" - ], - [ - "ĠdËĮe", - "lla" - ], - [ - "tËIJ", - "i" - ], - [ - "ĠËĪÊĬ", - "n" - ], - [ - "Ġh", - "iËIJ" - ], - [ - "Ġb", - "ËĪaËIJt" - ], - [ - "Ġth", - "ËĪi" - ], - [ - "Ġa", - "m" - ], - [ - "Ġ", - "ËĪoËIJ" - ], - [ - "Ġh", - "u" - ], - [ - "Ġk", - "ËĪÊĮh" - ], - [ - "Ġz", - "ËĪÉijËIJ" - ], - [ - "ĠÉ¡", - "ËĮÉĶ" - ], - [ - "Ġ", - "ËĪÉĻÊĬ" - ], - [ - "y", - "ËĪi" - ], - [ - "Ġl", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪeËIJ" - ], - [ - "Ġs", - "ËĪÉĶËIJ" - ], - [ - "sk", - "ËĮeËIJ" - ], - [ - "ɾ", - "o" - ], 
- [ - "Êģ", - "ËĪÉij" - ], - [ - "t", - "ËĪa" - ], - [ - "Ġk", - "ËĪÊĬ" - ], - [ - "ËĪant", - "e" - ], - [ - "Ġd", - "ÉĶ" - ], - [ - "Ġs", - "ËĪeɪ" - ], - [ - "Ġs", - "ÉĽt" - ], - [ - "ɹ", - "ɪ" - ], - [ - "ĠÉ¡ËĮÉĻÊĬ", - "ɪÅĭ" - ], - [ - "z", - "o" - ], - [ - "Ġj", - "ËĪaËIJ" - ], - [ - "ĠÉĴv", - "ðÉĻ" - ], - [ - "ĠÊ", - "Ŀ" - ], - [ - "ĠÉĽ", - "l" - ], - [ - "Ġs", - "ËĪoËIJ" - ], - [ - "Ġth", - "ËĪiÉľ" - ], - [ - "Ġ", - "ËĪÉĽl" - ], - [ - "Ġly", - "ËĮi" - ], - [ - "nd", - "ÊĴ" - ], - [ - "ĠÉķ", - "jËĪÉiju" - ], - [ - "θ", - "a" - ], - [ - "ĠɾËĮÉĻh", - "eËIJ" - ], - [ - "Ġma", - "ɪ" - ], - [ - "j", - "ÉĻ" - ], - [ - "ĠËĪÊĮ", - "b" - ], - [ - "as", - "jËĪÉĶ" - ], - [ - "d", - "Êģ" - ], - [ - "Ġkh", - "ËĪa" - ], - [ - "ĠËĪe", - "s" - ], - [ - "v", - "i" - ], - [ - "f", - "i" - ], - [ - "ËĮÉĻ", - "b" - ], - [ - "Ġr", - "e" - ], - [ - "Ġav", - "ËĮÉĽ" - ], - [ - "Ġt", - "ËĮi" - ], - [ - "Ġk", - "ɾ" - ], - [ - "Ġb", - "ɪk" - ], - [ - "st", - "e" - ], - [ - "ËĪeËIJÊĥ", - "c" - ], - [ - "p", - "t" - ], - [ - "z", - "ÉĻ" - ], - [ - "Ġw", - "ËĪaËIJ" - ], - [ - "k", - "l" - ], - [ - "ĠsËĪÊĮ", - "m" - ], - [ - "ɪ", - "ÊĪ" - ], - [ - "d", - "z" - ], - [ - "v", - "o" - ], - [ - "ËĮa", - "ÊĬt" - ], - [ - "nd", - "e" - ], - [ - "Ġd", - "ÉĽs" - ], - [ - "ĠÉŁ", - "ËĪaËIJ" - ], - [ - "Ġr", - "ËĮi" - ], - [ - "s", - "ËĮeËIJ" - ], - [ - "É¡", - "i" - ], - [ - "Ġal", - "s" - ], - [ - "ËĪi", - "ðo" - ], - [ - "ĠnËĪi", - "Éľn" - ], - [ - "ÊĬ", - "l" - ], - [ - "ts", - "ËIJ" - ], - [ - "ËĪant", - "o" - ], - [ - "ĠÉĹ", - "ËĪÉĻÊĬ" - ], - [ - "kËIJ", - "i" - ], - [ - "ĠsËĪÊĮ", - "b" - ], - [ - "Ġn", - "ËĪa" - ], - [ - "Ġl", - "ËĮo" - ], - [ - "Ġph", - "ËĪi" - ], - [ - "m", - "ËĮe" - ], - [ - "Ġf", - "a" - ], - [ - "k", - "ÉĻ" - ], - [ - "Ġz", - "ËĪu" - ], - [ - "n", - "s" - ], - [ - "ĠÊģ", - "e" - ], - [ - "Ġb", - "ËĪo" - ], - [ - "ËĪaËIJt", - "i" - ], - [ - "Ġm", - "an" - ], - [ - "ĠlËĪi", - "Éij" - ], - [ - "ĠÉĹ", - "ËĮyÉĻ" - ], - [ - "Ġf", - "ËĪÉĶËIJ" - ], - [ - "ĠkÊĭ", - "ËĪeËIJÊĥc" - ], 
- [ - "Ġx", - "ËĪÉij" - ], - [ - "ĠtÉķ", - "ËĪu" - ], - [ - "j", - "ÉĻɾ" - ], - [ - "Ġɪ", - "st" - ], - [ - "w", - "ËĪi" - ], - [ - "ĠËĮaɪn", - "ÉĻ" - ], - [ - "ɪ", - "É¡" - ], - [ - "Ġs", - "ÊĪ" - ], - [ - "ËĪi", - "ÉĻl" - ], - [ - "Ġn", - "ËĪiÉĽÉľn" - ], - [ - "ĠËĮÉĽ", - "ËIJ" - ], - [ - "ËĪaɪ", - "nd" - ], - [ - "Ġz", - "ËĪi" - ], - [ - "v", - "ÉĻn" - ], - [ - "m", - "z" - ], - [ - "ð", - "os" - ], - [ - "dÊĴ", - "ËIJ" - ], - [ - "j", - "ËĪa" - ], - [ - "ɾ", - "ËĪÉĶ" - ], - [ - "l", - "ËĪe" - ], - [ - "Ê", - "²" - ], - [ - "Ġv", - "ËĪÉĶ" - ], - [ - "Ġl", - "ËĪiÉĽ" - ], - [ - "θ", - "e" - ], - [ - "mËĪe", - "nte" - ], - [ - "Ġɪn", - "ðÉĻ" - ], - [ - "Ġaɪ", - "m" - ], - [ - "n", - "ÉĻn" - ], - [ - "Ġh", - "ÉĻm" - ], - [ - "ɾ", - "aËIJ" - ], - [ - "ĠsËĪuo", - "Éľ" - ], - [ - "Ġɲ", - "ËĪi" - ], - [ - "Ġɹ", - "ËĪiÉĻl" - ], - [ - "l", - "ËĪa" - ], - [ - "Ġb", - "ËĪÉĶ" - ], - [ - "Ġk", - "ËĪai" - ], - [ - "Êģ", - "ËĪa" - ], - [ - "Ġw", - "ËĪÉľËIJ" - ], - [ - "Ġa", - "ËIJ" - ], - [ - "Ġp", - "as" - ], - [ - "ËĪÊĮ", - "s" - ], - [ - "w", - "ËĪÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪe" - ], - [ - "ĠhËĮa", - "tÉĻ" - ], - [ - "a", - "ɪn" - ], - [ - "ĠËĪÉĶ", - "pʰ" - ], - [ - "Êģ", - "ËĪe" - ], - [ - "ĠÉŁaËIJ", - "ËĪeËIJÉ¡aËIJ" - ], - [ - "ĠËĪÊĬ", - "s" - ], - [ - "ĠtÉķhËĪi", - "Éľ" - ], - [ - "nt", - "Êĥ" - ], - [ - "Ġx", - "ËĪuo" - ], - [ - "ËĪu", - "Êģ" - ], - [ - "Ġɪ", - "m" - ], - [ - "ɳ", - "Éĸ" - ], - [ - "ËĪyÉĻ", - "Éľkh" - ], - [ - "ĠËĪy", - "ÉĽ" - ], - [ - "Ġm", - "ËĮaËIJ" - ], - [ - "Åĵ", - "Êģ" - ], - [ - "ĠËĪa", - "lt" - ], - [ - "Ġk", - "ÉĻm" - ], - [ - "Êİ", - "o" - ], - [ - "ĠÉIJ", - "n" - ], - [ - "Ġf", - "y" - ], - [ - "ĠËĮÉĽ", - "ra" - ], - [ - "ĠÉ¡", - "ËĪÊĬ" - ], - [ - "Ġp", - "ËĪÊĮ" - ], - [ - "l", - "s" - ], - [ - "Ġl", - "ËĪiËIJ" - ], - [ - "ĠÊĤ", - "ËĪy" - ], - [ - "Ġbɪk", - "ËĪÊĮz" - ], - [ - "ĠÉ¡", - "ÉĽt" - ], - [ - "Ġb", - "ɾ" - ], - [ - "t", - "ʰ" - ], - [ - "tÉĻl", - "ËĮÉĻb" - ], - [ - "x", - "o" - ], - [ - "sk", - "ËĮaËIJ" - ], - [ - "ɲ", - "ʲ" - ], - [ 
- "ËĪeËIJk", - "ÊĪ" - ], - [ - "r", - "ÉĻ" - ], - [ - "tÊĥ", - "o" - ], - [ - "ĠpÊģ", - "ÉĶ" - ], - [ - "Ġɹ", - "ËĪaɪt" - ], - [ - "Ġp", - "ËĪei" - ], - [ - "ËĮ", - "ɪç" - ], - [ - "j", - "ËĪÉĽÉ¾" - ], - [ - "tËIJ", - "a" - ], - [ - "ĠÉIJb", - "ËĮaÊĬt" - ], - [ - "ĠkÊĭËĪeËIJÊĥc", - "ÉĻn" - ], - [ - "Ġv", - "ËĪe" - ], - [ - "ÊĬ", - "Éľ" - ], - [ - "Ġa", - "kËĪe" - ], - [ - "Ġp", - "ËĪai" - ], - [ - "v", - "ËĪÉĽ" - ], - [ - "Ġθ", - "ɹ" - ], - [ - "ɪ", - "f" - ], - [ - "Ġav", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪe" - ], - [ - "d", - "ËĪi" - ], - [ - "ËĪeËIJ", - "Éĸ" - ], - [ - "Ġb", - "ÉĻt" - ], - [ - "ÊĪ", - "ʰ" - ], - [ - "t", - "eËIJ" - ], - [ - "θj", - "ËĪÉĶn" - ], - [ - "d", - "Éľ" - ], - [ - "ĠjËĪi", - "Éľ" - ], - [ - "Ġv", - "e" - ], - [ - "É£", - "ËĪu" - ], - [ - "ËĪÊĮh", - "ÉĻl" - ], - [ - "Ġp", - "ÉĶ" - ], - [ - "ĠÉ¡", - "r" - ], - [ - "Ġð", - "a" - ], - [ - "Ġv", - "ËĪiËIJ" - ], - [ - "ĠËĮ", - "ÉijËIJ" - ], - [ - "ËĪÉĻÊĬ", - "nt" - ], - [ - "Ġb", - "ËĪaËIJɾ" - ], - [ - "ĠmËĪÊĮ", - "tÉĻlËĮÉĻb" - ], - [ - "l", - "d" - ], - [ - "ĠtÉķ", - "ËĮÉĶ" - ], - [ - "p", - "a" - ], - [ - "ð", - "ËĪad" - ], - [ - "ËĪi", - "ɾ" - ], - [ - "Ġx", - "ËĪu" - ], - [ - "ĠlËĪi", - "ÉľÅĭ" - ], - [ - "ËĪeɪ", - "s" - ], - [ - "ĠÉĹËĮe", - "Éľn" - ], - [ - "Ġth", - "ËĪiÉĽ" - ], - [ - "tËIJ", - "e" - ], - [ - "ĠavËĮÉĽ", - "k" - ], - [ - "ĠËĮ", - "ÉĶ" - ], - [ - "Ġk", - "ËĪÉiju" - ], - [ - "ɪ", - "v" - ], - [ - "iËIJ", - "z" - ], - [ - "ËĪo", - "s" - ], - [ - "ĠÉ¡", - "ɹ" - ], - [ - "a", - "nd" - ], - [ - "ĠlËĪi", - "ou" - ], - [ - "ĠËĪo", - "Éľ" - ], - [ - "É¡", - "l" - ], - [ - "Ġp", - "ËĪÉĶËIJ" - ], - [ - "Ġm", - "ËĮeËIJ" - ], - [ - "Ġk", - "ËĪÉĴ" - ], - [ - "n", - "os" - ], - [ - "ç", - "ÉĻn" - ], - [ - "f", - "ÉĻn" - ], - [ - "ĠsËĪÊĮkt", - "ËĮeËIJ" - ], - [ - "Ġ", - "ËĪaɪn" - ], - [ - "ËĪoËIJ", - "re" - ], - [ - "j", - "ËĪÉĽn" - ], - [ - "Ġð", - "ËĪÉĽn" - ], - [ - "ĠtÉķh", - "ËĪiÉĽÉľn" - ], - [ - "Ġh", - "ËĪaɪ" - ], - [ - "ɾ", - "ËĪÉĽ" - ], - [ - "Ġs", - "ËĪu" - ], - [ - "ĠkËĪɪ", - 
"jaËIJ" - ], - [ - "Ġpj", - "ËĮÊĬ" - ], - [ - "ĠhÉĻm", - "ËĮaËIJ" - ], - [ - "ĠËĮÊĮ", - "p" - ], - [ - "Ġp", - "ËĪÊĮhÉĻl" - ], - [ - "Ġx", - "ËĪÉĻ" - ], - [ - "d", - "ËĪe" - ], - [ - "Ġm", - "Éij" - ], - [ - "ĠÊĬ", - "m" - ], - [ - "nd", - "ÉĻ" - ], - [ - "Ġd", - "ËĪÉĻÊĬnt" - ], - [ - "ËĪeËIJ", - "ÊĥÉĻn" - ], - [ - "Ġða", - "ts" - ], - [ - "i", - "s" - ], - [ - "Ġc", - "ËĪaËIJh" - ], - [ - "p", - "e" - ], - [ - "Ġs", - "ËĮo" - ], - [ - "Ġð", - "ËĪe" - ], - [ - "Ġs", - "ËĪaËIJt" - ], - [ - "ËĪa", - "Êģ" - ], - [ - "Ġs", - "ËĪe" - ], - [ - "ÉĻ", - "k" - ], - [ - "ɪ", - "Êĭ" - ], - [ - "ĠkËĪoËIJ", - "i" - ], - [ - "k", - "ÉĶ" - ], - [ - "Ġv", - "ËĪaËIJÊĬ" - ], - [ - "Ġf", - "ËĪei" - ], - [ - "Ġl", - "ËĪeËIJk" - ], - [ - "Ġh", - "ËĪiÉĻ" - ], - [ - "Ġa", - "ÊĬ" - ], - [ - "ËĪÉĽ", - "ndo" - ], - [ - "ËĪe", - "s" - ], - [ - "Ġz", - "ËĪÉĶ" - ], - [ - "Ġ", - "ËĪÉĽÉ¾a" - ], - [ - "nËĪi", - "Éľn" - ], - [ - "ĠkËĪÊĮ", - "m" - ], - [ - "Ġl", - "ËĪÉĴ" - ], - [ - "ɪ", - "st" - ], - [ - "Ġp", - "Éij" - ], - [ - "Ġf", - "ËĪÉĶ" - ], - [ - "Ġth", - "ËĪonÉ¡" - ], - [ - "nk", - "e" - ], - [ - "ËĮ", - "ɪk" - ], - [ - "Ġɲ", - "ËĪÉĻ" - ], - [ - "ËĮÊĮ", - "m" - ], - [ - "ËĪiËIJ", - "t" - ], - [ - "ĠwËĪÉĴ", - "nt" - ], - [ - "ËĪaβ", - "an" - ], - [ - "ĠbËĪÊĮ", - "r" - ], - [ - "ÉĽ", - "nd" - ], - [ - "ĠËĮÉijËIJ", - "bÉľ" - ], - [ - "Ġv", - "ËĪaɪ" - ], - [ - "ĠtÊĥ", - "ËĮi" - ], - [ - "ĠθËĪɪÅĭ", - "k" - ], - [ - "st", - "i" - ], - [ - "Ġk", - "ɹ" - ], - [ - "ĠËĪa", - "ÊĬt" - ], - [ - "st", - "ÉĻn" - ], - [ - "ĠÊĭ", - "ËĪÊĮn" - ], - [ - "ĠÉ¡", - "ËĮaËIJ" - ], - [ - "ËĪaËIJÉľ", - "ɲ" - ], - [ - "Êģ", - "i" - ], - [ - "ĠnËĪÉĶ", - "x" - ], - [ - "ĠɹËĪiÉĻl", - "ɪ" - ], - [ - "Ġv", - "ËĮi" - ], - [ - "Ġðe", - "ÉĻ" - ], - [ - "ËĮɪ", - "tÊĥ" - ], - [ - "Ġv", - "ËĪyÉĻ" - ], - [ - "ĠËĮaËIJpk", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĮaËIJɪ" - ], - [ - "Ġp", - "ËĪÉĶ" - ], - [ - "ĠnËĪÊĮ", - "mb" - ], - [ - "θ", - "es" - ], - [ - "j", - "ËĪÉĽÊģ" - ], - [ - "ĠkËĪÊĬ", - "cʰ" - ], - [ - "m", - "ËĪÉĽ" - ], - [ - 
"Ġv", - "ËĪu" - ], - [ - "Ġl", - "ÅĵÊģ" - ], - [ - "ĠiËIJ", - "m" - ], - [ - "ÊĪ", - "ÉĻɾ" - ], - [ - "tÊĥ", - "i" - ], - [ - "ËIJ", - "s" - ], - [ - "Ġt", - "ËĪy" - ], - [ - "ĠmËĪi", - "ÉľÅĭ" - ], - [ - "ɾ", - "ËĪe" - ], - [ - "m", - "ËĮa" - ], - [ - "Ġm", - "ËĮiËIJ" - ], - [ - "ĠÉĽ", - "ks" - ], - [ - "ɪ", - "p" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮaËIJ" - ], - [ - "ĠËĮaÊĬ", - "x" - ], - [ - "r", - "ËĪiËIJ" - ], - [ - "Ġc", - "ËĪÊĮl" - ], - [ - "m", - "os" - ], - [ - "ĠkËĪÊĮɾt", - "ËĮeËIJ" - ], - [ - "iËIJ", - "ɾ" - ], - [ - "k", - "ÉĻn" - ], - [ - "Ġd", - "ËĪu" - ], - [ - "n", - "aËIJ" - ], - [ - "Ġp", - "wËĪe" - ], - [ - "ËĮÉĶ", - "ɪ" - ], - [ - "ĠtÉķh", - "ËĪiÉĽ" - ], - [ - "Ġβ", - "ËĪi" - ], - [ - "ËĪiÉĽ", - "Éľt" - ], - [ - "Ġt", - "e" - ], - [ - "ËĪað", - "os" - ], - [ - "m", - "ËĪa" - ], - [ - "Ġv", - "ËĪo" - ], - [ - "Ġm", - "ËĪɪ" - ], - [ - "Ġb", - "ËĮi" - ], - [ - "a", - "d" - ], - [ - "d", - "o" - ], - [ - "Ġn", - "ËĪaÊĬ" - ], - [ - "ĠʲËĪy", - "Éľ" - ], - [ - "w", - "ËĪÉĽ" - ], - [ - "ËĪi", - "s" - ], - [ - "e", - "l" - ], - [ - "Ġpa", - "r" - ], - [ - "Ġt", - "ËĪai" - ], - [ - "ĠdËĪɪ", - "jaËIJ" - ], - [ - "h", - "ËĪi" - ], - [ - "Ġɾ", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪe" - ], - [ - "ËĪaɪ", - "d" - ], - [ - "Ġp", - "er" - ], - [ - "Ġs", - "ËĮÉĶ" - ], - [ - "w", - "e" - ], - [ - "ÊĬ", - "m" - ], - [ - "Ġi", - "n" - ], - [ - "ĠjËĪuËIJ", - "z" - ], - [ - "ËĪiËIJp", - "ÉĻl" - ], - [ - "ĠÊĭ", - "ËĪaËIJl" - ], - [ - "Ġe", - "tËĪÉĽ" - ], - [ - "ËĮÉĽ", - "m" - ], - [ - "Ġn", - "ËĪu" - ], - [ - "ËĪÉĽ", - "kt" - ], - [ - "ĠiËIJ", - "ɾ" - ], - [ - "Ġb", - "ɹ" - ], - [ - "Ġtsh", - "ËĪi" - ], - [ - "ĠÉĹ", - "ËĪÉĶÉľ" - ], - [ - "Ġkw", - "ËĮa" - ], - [ - "Ġf", - "ËĪuÉľ" - ], - [ - "w", - "ËĮa" - ], - [ - "Ġd", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪyÉĻ" - ], - [ - "ËĮÉĽ", - "ËIJ" - ], - [ - "r", - "ËĪa" - ], - [ - "Ġn", - "e" - ], - [ - "Ġz", - "ËĪyÉĻ" - ], - [ - "Ġb", - "ËĪaɪ" - ], - [ - "ĠÉŁ", - "ËĪÊĮb" - ], - [ - "ËĪuËIJ", - "to" - ], - [ - "ÊĬ", - "nt" - ], - [ - "Ġc", - "ʰ" - 
], - [ - "ËĪÉĽnt", - "i" - ], - [ - "ËĪo", - "ÉĻ" - ], - [ - "Ġs", - "ËĮÊĮm" - ], - [ - "Ġl", - "Éij" - ], - [ - "ËĮe", - "va" - ], - [ - "ɾ", - "ÉĽ" - ], - [ - "nt", - "Éľ" - ], - [ - "Ġm", - "ËĪÉĽn" - ], - [ - "ËĪÉijËIJ", - "k" - ], - [ - "Ġki", - "l" - ], - [ - "ËĪon", - "es" - ], - [ - "f", - "f" - ], - [ - "Ġm", - "ËĪÉĽËIJ" - ], - [ - "Ġv", - "ËĪÉĻɪ" - ], - [ - "Ġ", - "ËĪÉĶËIJ" - ], - [ - "ĠËĮɪ", - "nt" - ], - [ - "ÊĬ", - "n" - ], - [ - "Ġw", - "ɪl" - ], - [ - "Ġs", - "in" - ], - [ - "ĠËĮa", - "lla" - ], - [ - "Ġaβ", - "ËĪia" - ], - [ - "p", - "i" - ], - [ - "ËĪo", - "Éľ" - ], - [ - "ɪj", - "ËĮaËIJ" - ], - [ - "k", - "u" - ], - [ - "Ġv", - "ËĪɪ" - ], - [ - "Ġtu", - "t" - ], - [ - "ĠtËĪe", - "Éľ" - ], - [ - "Ġh", - "ËĪÉĶ" - ], - [ - "β", - "ɾe" - ], - [ - "s", - "ÉĻɾ" - ], - [ - "Ġkh", - "ËĪai" - ], - [ - "Ġm", - "ËĪÉĶ" - ], - [ - "Ġt", - "a" - ], - [ - "Ġɲ", - "ËĪaËIJ" - ], - [ - "Ġn", - "u" - ], - [ - "ËĪuËIJ", - "n" - ], - [ - "ĠÉĻËIJ", - "Éľ" - ], - [ - "ĠËĪa", - "ÊĬf" - ], - [ - "ËĪiËIJd", - "Éľ" - ], - [ - "nt", - "i" - ], - [ - "Ġp", - "ËĪiËIJpÉĻl" - ], - [ - "Ġk", - "j" - ], - [ - "Ġp", - "e" - ], - [ - "Ġm", - "ËĪÉij" - ], - [ - "ËĮa", - "ɪ" - ], - [ - "ËĪaËIJ", - "le" - ], - [ - "Ġv", - "ËĮÉĻËIJÉªÉľ" - ], - [ - "mp", - "o" - ], - [ - "ĠkËĪɪ", - "t" - ], - [ - "Ġn", - "ËĮÉĽ" - ], - [ - "ĠÉŁ", - "ËĪaËIJtaËIJ" - ], - [ - "ĠsËĪaËIJt", - "ʰ" - ], - [ - "ĠÉŁ", - "ËĪi" - ], - [ - "Ġs", - "o" - ], - [ - "Ġb", - "ËĪÉĽ" - ], - [ - "k", - "ËĪi" - ], - [ - "ɪt", - "i" - ], - [ - "Ġts", - "i" - ], - [ - "Ġk", - "Êģ" - ], - [ - "ËĮ", - "ÉĴ" - ], - [ - "É¡", - "ÉĻl" - ], - [ - "k", - "st" - ], - [ - "Ġm", - "ËĪÉĻËIJ" - ], - [ - "ËĪÊĮ", - "k" - ], - [ - "Ġn", - "ËĪaËIJÊĬ" - ], - [ - "Ġa", - "p" - ], - [ - "ĠlËĪɪ", - "kʰ" - ], - [ - "ll", - "i" - ], - [ - "ĠkwËĪa", - "l" - ], - [ - "Ġ", - "ËĪÉĻËIJ" - ], - [ - "Ġts", - "ËĪuei" - ], - [ - "Ġd", - "o" - ], - [ - "ĠkËIJ", - "jËĪo" - ], - [ - "ÊĬ", - "z" - ], - [ - "Ġp", - "ËĪaËIJ" - ], - [ - "Ġm", - "ËĪuËIJ" - ], - [ - 
"ĠÉ¡ÉĻ", - "v" - ], - [ - "r", - "ËĪi" - ], - [ - "Ġt", - "w" - ], - [ - "ËĮ", - "ɪn" - ], - [ - "d", - "ËĪÉij" - ], - [ - "Ġð", - "ËĪi" - ], - [ - "ĠËĪaËIJ", - "i" - ], - [ - "Ġh", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ËĮÉĽm" - ], - [ - "Ġpʰ", - "ËĪɪɾ" - ], - [ - "ÉĴ", - "m" - ], - [ - "ĠËĮ", - "eËIJ" - ], - [ - "Ġth", - "ËĪaiÉľ" - ], - [ - "Ġv", - "ËĪas" - ], - [ - "Ġn", - "ÉijËIJ" - ], - [ - "p", - "ÉĻn" - ], - [ - "Ġp", - "ËĮÉĻɾ" - ], - [ - "ĠÉĹ", - "ËĪaËIJɪ" - ], - [ - "ËĪou", - "Éľ" - ], - [ - "ĠÊIJ", - "ËĪuÉľ" - ], - [ - "ĠmËĪa", - "n" - ], - [ - "ĠtËĪÉĻ", - "ÉªÉľ" - ], - [ - "Ġl", - "ËĪaËIJÊĬ" - ], - [ - "m", - "ËĪÉĽnte" - ], - [ - "ĠfËĪa", - "m" - ], - [ - "s", - "jËĪÉĶ" - ], - [ - "Ġp", - "ËĪÉĻ" - ], - [ - "ËĪeËIJ", - "m" - ], - [ - "Ġp", - "ËĪÊĮr" - ], - [ - "j", - "ËĪi" - ], - [ - "Ġl", - "ÉĽ" - ], - [ - "Ġt", - "en" - ], - [ - "ËĪoËIJ", - "ra" - ], - [ - "k", - "i" - ], - [ - "ĠÊĤ", - "ËĪaËIJÊĬ" - ], - [ - "k", - "ɪ" - ], - [ - "bËIJ", - "e" - ], - [ - "ËĪa", - "lt" - ], - [ - "ð", - "ɪ" - ], - [ - "p", - "ËĪi" - ], - [ - "ĠËĮÉĽ", - "nt" - ], - [ - "Ġm", - "ËĪei" - ], - [ - "Ġh", - "ËĪÉĻÊĬ" - ], - [ - "Ġh", - "ËĪÉĽÉ¾" - ], - [ - "j", - "ËĪÉij" - ], - [ - "ĠhËĪÊĬ", - "aËIJ" - ], - [ - "m", - "Éľ" - ], - [ - "Ġd", - "ʰ" - ], - [ - "ĠtÊĥ", - "ËĪe" - ], - [ - "l", - "ËĪÉĽ" - ], - [ - "ËĪaËIJt", - "e" - ], - [ - "Ġp", - "ËĪuËIJ" - ], - [ - "Ġm", - "ËĪÊĬ" - ], - [ - "ËĪaËIJɪ", - "ÊĪ" - ], - [ - "d", - "iËIJ" - ], - [ - "Ġfɹ", - "ÉĴm" - ], - [ - "Ġh", - "ËĪÉijËIJ" - ], - [ - "β", - "o" - ], - [ - "ĠmËĪi", - "Éľn" - ], - [ - "Ġð", - "iËIJz" - ], - [ - "Ġk", - "ËĪou" - ], - [ - "ËĪiËIJ", - "na" - ], - [ - "Ġav", - "ËĮeva" - ], - [ - "Ġ", - "ËĪaËIJɾ" - ], - [ - "Ġn", - "ËĪuËIJɾ" - ], - [ - "Ġβ", - "ËĪe" - ], - [ - "Ġz", - "aɪn" - ], - [ - "ËĪÉĽ", - "d" - ], - [ - "É", - "Ĺ" - ], - [ - "ËĪeɪ", - "k" - ], - [ - "s", - "ËĮÉĻÊĬ" - ], - [ - "ËĪeËIJ", - "ÉŁ" - ], - [ - "ĠÊĤ", - "ËĪÉĻËIJ" - ], - [ - "j", - "e" - ], - [ - "cʰ", - "ËIJ" - ], - [ - "ËĪÉĶ", - "r" - ], - [ - "ÉĽ", 
- "ËIJ" - ], - [ - "ĠtÉķhËĪy", - "Ã¦Éľn" - ], - [ - "ĠËĮaɪn", - "ÉĻn" - ], - [ - "ĠiËIJ", - "n" - ], - [ - "ĠbËĪÊĮ", - "c" - ], - [ - "ËĪiËIJ", - "m" - ], - [ - "ɾ", - "as" - ], - [ - "ËĮÉĻ", - "s" - ], - [ - "Ġv", - "ËĪeËIJ" - ], - [ - "ĠËĪÉĻr", - "Éľ" - ], - [ - "Ġd", - "uËIJ" - ], - [ - "nt", - "ÉĻ" - ], - [ - "Ġpɹ", - "ËĪÉĴ" - ], - [ - "Ġb", - "ËĪɪ" - ], - [ - "ĠwËĪo", - "Éľ" - ], - [ - "n", - "ËĮi" - ], - [ - "Ġh", - "ÉIJ" - ], - [ - "Ġk", - "ËĪÉĽ" - ], - [ - "Ġe", - "t" - ], - [ - "jËĪÉĽ", - "ndo" - ], - [ - "ĠËĪai", - "Éľ" - ], - [ - "Ġl", - "i" - ], - [ - "ĠËĪaÊĬ", - "s" - ], - [ - "kËIJ", - "o" - ], - [ - "ĠÉĹ", - "ËĪyÉĻ" - ], - [ - "k", - "eËIJ" - ], - [ - "Ġf", - "ËĪiËIJl" - ], - [ - "Ġbʰ", - "ËĪaËIJi" - ], - [ - "ĠÉ¡ÉĻ", - "Êĥ" - ], - [ - "ÊĴ", - "ËĪe" - ], - [ - "Ġn", - "jËĪuËIJ" - ], - [ - "ĠËĪa", - "k" - ], - [ - "ĠÉĹ", - "ËĪaËIJ" - ], - [ - "z", - "ËĪa" - ], - [ - "v", - "ËĪe" - ], - [ - "ĠhËĮa", - "ÊĬ" - ], - [ - "ÉIJ", - "ç" - ], - [ - "ĠɾËĪÊĮ", - "kʰ" - ], - [ - "p", - "ËĪe" - ], - [ - "ĠtÉĻ", - "bi" - ], - [ - "ĠpËĪÊĮhÉĻl", - "ËĮeËIJ" - ], - [ - "Ġf", - "ËĪÉĽ" - ], - [ - "Ġw", - "ËĮɪtÊĥ" - ], - [ - "ĠtÉķËĪy", - "ÉĽÉľ" - ], - [ - "w", - "ËĮe" - ], - [ - "ËĮa", - "ɪt" - ], - [ - "ĠnÉijËIJ", - "x" - ], - [ - "ĠkËĪÉĶËIJ", - "n" - ], - [ - "ÊĬ", - "k" - ], - [ - "ĠbËĪaËIJ", - "d" - ], - [ - "Åĭ", - "ÉĻn" - ], - [ - "Ġn", - "i" - ], - [ - "Ġb", - "ËĪe" - ], - [ - "Ġm", - "ËĮÊĬ" - ], - [ - "ËĪa", - "r" - ], - [ - "ĠmËĮe", - "ɪk" - ], - [ - "Ġs", - "ËĪaËIJɾ" - ], - [ - "β", - "e" - ], - [ - "ĠtÉķhËĪi", - "ÉľÅĭ" - ], - [ - "it", - "ËĪe" - ], - [ - "k", - "ËĮe" - ], - [ - "ËĪÉĽËIJ", - "l" - ], - [ - "ËĮ", - "ÉĴn" - ], - [ - "ËĮ", - "Éij" - ], - [ - "Ġb", - "ËĪɪl" - ], - [ - "Ġw", - "ÊĬd" - ], - [ - "Ġb", - "ËĪoËIJl" - ], - [ - "r", - "d" - ], - [ - "i", - "ÉĻ" - ], - [ - "Ġd", - "a" - ], - [ - "Ġb", - "ËĪaËIJÊĬ" - ], - [ - "ĠnËĪÊĮmb", - "ÉĻɾ" - ], - [ - "ËĪaËIJɪ", - "Éľ" - ], - [ - "ĠÉĽ", - "m" - ], - [ - "Ġm", - "iËIJɾ" - ], - [ - "ËĪeɪ", - "m" - ], - [ 
- "l", - "os" - ], - [ - "ËĮÉĽ", - "t" - ], - [ - "ĠËĮaÊĬ", - "s" - ], - [ - "ĠmËĪa", - "Éľt" - ], - [ - "Ġw", - "ËĪuÉĻ" - ], - [ - "Ġw", - "ËĪeɪ" - ], - [ - "Ġse", - "ɲ" - ], - [ - "Ġb", - "jËĪÉĽ" - ], - [ - "Ġw", - "ÉĽn" - ], - [ - "f", - "l" - ], - [ - "Ġkh", - "wËĪa" - ], - [ - "d", - "ËĪÉĽ" - ], - [ - "v", - "ɹɪ" - ], - [ - "ĠËĪa", - "ɾ" - ], - [ - "jËĪÉiju", - "Éľ" - ], - [ - "ĠËĮaËIJpk", - "ËĮeËIJ" - ], - [ - "b", - "Êģ" - ], - [ - "ĠtËĪaɪ", - "m" - ], - [ - "Ġ", - "ËĪÉij" - ], - [ - "Ġs", - "ËĮa" - ], - [ - "Ġz", - "ËĪoɪ" - ], - [ - "ËĪÉĶɾ", - "a" - ], - [ - "Ġd", - "ËĪø" - ], - [ - "ËĪÉĶɾ", - "t" - ], - [ - "ĠÅĭ", - "ËĪÉĶ" - ], - [ - "m", - "in" - ], - [ - "Ġl", - "ËĪÊĬk" - ], - [ - "ËĪÉĶËIJ", - "t" - ], - [ - "ĠËĪÉĶ", - "tɾ" - ], - [ - "Ġf", - "ËĪaɪ" - ], - [ - "ĠÉ¡", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "ÉĻn" - ], - [ - "k", - "ËĪÉĶ" - ], - [ - "ĠvËĪÉĽ", - "ɹi" - ], - [ - "m", - "ÉĽ" - ], - [ - "ËĪaɪ", - "z" - ], - [ - "Ġe", - "sp" - ], - [ - "ɲ", - "a" - ], - [ - "Ġl", - "ËĪo" - ], - [ - "ËĪÉĽËIJ", - "ra" - ], - [ - "β", - "ËĪi" - ], - [ - "ou", - "Éľ" - ], - [ - "ËĮÉĻ", - "k" - ], - [ - "tÊĥ", - "uËIJ" - ], - [ - "Ġn", - "ËĪyÉĻ" - ], - [ - "ÊĪ", - "ɾ" - ], - [ - "ĠÉ¡", - "ËĪy" - ], - [ - "ĠtËĪo", - "ðo" - ], - [ - "ËĪɪ", - "çt" - ], - [ - "Ġm", - "ɪç" - ], - [ - "ĠËĪa", - "nd" - ], - [ - "Ġkw", - "ËĮÉĽl" - ], - [ - "ĠÊĤ", - "ËĪaËIJ" - ], - [ - "ĠnËĪi", - "Éľ" - ], - [ - "ËĪÉĶ", - "p" - ], - [ - "ËĪiËIJ", - "z" - ], - [ - "ĠÊĤ", - "ËĪaÊĬ" - ], - [ - "ĠɾËĮÉĻh", - "i" - ], - [ - "ĠsËĮÊĬ", - "o" - ], - [ - "ĠÉĽ", - "É¡" - ], - [ - "Ġd", - "Åĵ" - ], - [ - "ĠÉ¡ËĮaËIJ", - "ÉªÉľ" - ], - [ - "d", - "ɪ" - ], - [ - "l", - "ËĮa" - ], - [ - "st", - "ËĪi" - ], - [ - "ĠdËĮiËIJ", - "z" - ], - [ - "Ġt", - "ËĮÊĬ" - ], - [ - "θ", - "i" - ], - [ - "ĠËĪɪ", - "skËĮoËIJ" - ], - [ - "nd", - "ÉĻn" - ], - [ - "Ġts", - "v" - ], - [ - "Ġh", - "ËĪÉĻËIJ" - ], - [ - "ĠÊĥ", - "ËĪÊĬ" - ], - [ - "ÉĻt", - "ËĮeËIJ" - ], - [ - "p", - "ËĮÉĽ" - ], - [ - "ËĪaɾ", - "ÉĶn" - ], - [ - "Ġp", - "ÉĽÊģ" - 
], - [ - "Ġ", - "y" - ], - [ - "m", - "nËĮeËIJ" - ], - [ - "ËĪÉĽ", - "llo" - ], - [ - "ĠÉ¡", - "ËĪÉĻ" - ], - [ - "ĠËĮa", - "d" - ], - [ - "ĠÊĥ", - "v" - ], - [ - "ËĪÊı", - "ɾ" - ], - [ - "r", - "ËĪe" - ], - [ - "y", - "ËIJ" - ], - [ - "Ġp", - "ËĪaËIJs" - ], - [ - "Ġ", - "ËĪÉĽn" - ], - [ - "ɪ", - "dÊĴ" - ], - [ - "ËĪua", - "i" - ], - [ - "Ġf", - "i" - ], - [ - "Ġt", - "ËĪyÉĻ" - ], - [ - "ËĪaËIJ", - "ÉŁ" - ], - [ - "Ġt", - "jËĪe" - ], - [ - "ËĪaËIJn", - "aËIJ" - ], - [ - "st", - "ɾ" - ], - [ - "Êİ", - "e" - ], - [ - "ËĮe", - "ɪt" - ], - [ - "b", - "a" - ], - [ - "ð", - "as" - ], - [ - "v", - "Êģ" - ], - [ - "Ġz", - "ËĪÉĻËIJ" - ], - [ - "ËĪaËIJ", - "li" - ], - [ - "ÉŁÊ°", - "eËIJ" - ], - [ - "ËĪaËIJt", - "eËIJ" - ], - [ - "Ġv", - "ËĪa" - ], - [ - "Ġsa", - "l" - ], - [ - "ËĪaËIJ", - "no" - ], - [ - "ĠÉ¡ÉĻ", - "z" - ], - [ - "ĠhËĪoËIJ", - "ti" - ], - [ - "Ġɲ", - "ËĪiÉĽ" - ], - [ - "t", - "Éľ" - ], - [ - "ĠËĪaËIJ", - "p" - ], - [ - "Ġw", - "ËĪÉĽl" - ], - [ - "Ġm", - "ËĪɪl" - ], - [ - "Ġfy", - "ËIJɾ" - ], - [ - "ËĪÉĽËIJs", - "aËIJ" - ], - [ - "Ġb", - "ËĮiËIJ" - ], - [ - "ËĪaËIJ", - "jaËIJ" - ], - [ - "ËĪɪ", - "p" - ], - [ - "Ġf", - "Êģ" - ], - [ - "tsi", - "ËĪoËIJne" - ], - [ - "Ġw", - "ËĪuÉľ" - ], - [ - "Ġv", - "i" - ], - [ - "ĠwËĪÉij", - "Éľn" - ], - [ - "ËĪoËIJ", - "n" - ], - [ - "ĠÉĹ", - "ËĪÉĻɪ" - ], - [ - "ĠÊĿ", - "ËĪo" - ], - [ - "Ġr", - "a" - ], - [ - "m", - "ÉĻnt" - ], - [ - "ËĪaÊĬ", - "nd" - ], - [ - "Ġp", - "ÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪaËIJÊĬ" - ], - [ - "oËIJ", - "ɾ" - ], - [ - "h", - "ËĪo" - ], - [ - "ĠÉĴ", - "n" - ], - [ - "ĠÊİ", - "e" - ], - [ - "ĠsËĪɪ", - "ks" - ], - [ - "É¡", - "n" - ], - [ - "ĠÉ¡", - "ËĪa" - ], - [ - "Ġ", - "θj" - ], - [ - "Ġp", - "ËĪe" - ], - [ - "sp", - "e" - ], - [ - "Ġv", - "ËĪÉĻ" - ], - [ - "Ġf", - "ËĪɪ" - ], - [ - "ĠËĮɪnt", - "ÊĬ" - ], - [ - "l", - "ÉĻn" - ], - [ - "Ġn", - "ËĪiËIJd" - ], - [ - "ĠsËĮÊĬ", - "a" - ], - [ - "ĠËĪu", - "m" - ], - [ - "Ġd", - "ËĪeɪ" - ], - [ - "ĠËĪÊĮ", - "bʰi" - ], - [ - "ËĪÉijËIJ", - "ɾ" - ], - [ - "Ġb", 
- "ËĪiÉĽÉľt" - ], - [ - "Êİ", - "os" - ], - [ - "Ġtsh", - "ËĪaiÉľ" - ], - [ - "ĠËĮɪ", - "skËĮaËIJ" - ], - [ - "ĠaÊĬ", - "ÉĻ" - ], - [ - "ĠËĪy", - "æ" - ], - [ - "Ġd", - "yn" - ], - [ - "Ġm", - "ËĪiËIJn" - ], - [ - "ĠËĪÊĮ", - "cʰËIJ" - ], - [ - "Ġs", - "ÉĽ" - ], - [ - "Ġn", - "ËĪy" - ], - [ - "Ġn", - "ËĮÉĽl" - ], - [ - "É¡", - "ɾ" - ], - [ - "Êĥ", - "ËĪe" - ], - [ - "ĠÊĤ", - "ËĮÉĽ" - ], - [ - "ĠËĪÉĽ", - "vɹɪ" - ], - [ - "ËĪÉĽl", - "p" - ], - [ - "ĠbËĪa", - "k" - ], - [ - "Ġ", - "eËIJ" - ], - [ - "Ġf", - "ËĪaËIJ" - ], - [ - "Ġk", - "ÉĽl" - ], - [ - "ĠËĪeËIJ", - "s" - ], - [ - "j", - "ËĪaËIJd" - ], - [ - "Ġl", - "ËĮi" - ], - [ - "mb", - "ɾe" - ], - [ - "k", - "tÉĻ" - ], - [ - "nt", - "a" - ], - [ - "t", - "ËĪu" - ], - [ - "Ġð", - "ËĪat" - ], - [ - "ĠËĪa", - "β" - ], - [ - "ÉĻɹ", - "i" - ], - [ - "ĠkwËĮÉĽ", - "lla" - ], - [ - "Ġb", - "ÉĻn" - ], - [ - "r", - "ËĮÉĽ" - ], - [ - "Ġn", - "ÉĶ" - ], - [ - "ĠÉ¡", - "ËĪɪ" - ], - [ - "ĠËĪa", - "p" - ], - [ - "ɹ", - "ÉĻ" - ], - [ - "ËĪa", - "Éľkh" - ], - [ - "ĠÊIJ", - "ËĪi" - ], - [ - "Ġ", - "ËĪÉijËIJ" - ], - [ - "ɪ", - "É¡ÉĻn" - ], - [ - "Ġw", - "ËĪai" - ], - [ - "Ġp", - "ÉĻt" - ], - [ - "kËIJ", - "a" - ], - [ - "Ġb", - "ËĪÉĽËIJ" - ], - [ - "ËĪeËIJ", - "Êĭ" - ], - [ - "ls", - "ÉĻÊĬ" - ], - [ - "ĠcËĪaËIJh", - "ɪËĮeËIJ" - ], - [ - "Ġk", - "ÉĻn" - ], - [ - "ĠËĮaɪn", - "ÉĻm" - ], - [ - "ËĪuËIJ", - "t" - ], - [ - "Ġh", - "ËĪaÊĬ" - ], - [ - "Ġt", - "ËĪanto" - ], - [ - "ĠhÉIJ", - "z" - ], - [ - "Ġs", - "ËĪÊĮɾ" - ], - [ - "Ġn", - "o" - ], - [ - "Ġt", - "ËĪÉĶËIJ" - ], - [ - "Ġz", - "ËĪaɪ" - ], - [ - "ĠtÉķËĪiÉĽ", - "Éľ" - ], - [ - "Ġko", - "zËĪi" - ], - [ - "Ġk", - "ËĪei" - ], - [ - "ð", - "ËĪÉĶɾ" - ], - [ - "ËĮÉĶ", - "Êģ" - ], - [ - "Ġt", - "ËĪÊĮɾ" - ], - [ - "ĠÊIJ", - "ËĪÉĻ" - ], - [ - "ĠÉķËĪy", - "ÉĽÉľ" - ], - [ - "ĠmËĮÊĬ", - "ÉŁÊ°eËIJ" - ], - [ - "m", - "f" - ], - [ - "Ġv", - "ËĪiËIJdÉľ" - ], - [ - "k", - "ËĪa" - ], - [ - "ĠÉIJ", - "É¡" - ], - [ - "k", - "w" - ], - [ - "ĠÊģ", - "ÉĽ" - ], - [ - "x", - "ÉĻn" - ], - [ - "Ġd", - "ÊĬ" - ], 
- [ - "ĠkËĪÊĮɾ", - "nËĮeËIJ" - ], - [ - "jËĪaËIJd", - "aËIJ" - ], - [ - "Ġf", - "ÉĻ" - ], - [ - "ĠËĮi", - "mp" - ], - [ - "Ġh", - "ɪz" - ], - [ - "Ġ", - "ʰÏĩ" - ], - [ - "ËĪoËIJ", - "ni" - ], - [ - "Ġx", - "ËĪiÉľ" - ], - [ - "ËĪeËIJ", - "sÊĪ" - ], - [ - "Êı", - "bÉľ" - ], - [ - "ËĮÉĶɾ", - "ke" - ], - [ - "ĠÉ¡", - "ËĪÉĻÊĬ" - ], - [ - "ËĪɪ", - "ÊĥÉĻn" - ], - [ - "l", - "es" - ], - [ - "Ġf", - "ËĪiËIJ" - ], - [ - "É¡", - "tÉĻ" - ], - [ - "ËĪeËIJ", - "re" - ], - [ - "Ġv", - "ËĮaËIJ" - ], - [ - "Ġ", - "ËĪeɪ" - ], - [ - "Ġm", - "ËĪuÉĻÉľn" - ], - [ - "ĠÉ¡ËĪÊĬ", - "d" - ], - [ - "ĠmËĮa", - "ɪn" - ], - [ - "z", - "ËĪe" - ], - [ - "ĠlËĪi", - "Éľ" - ], - [ - "Ġm", - "u" - ], - [ - "Ġk", - "ËĮÉĽl" - ], - [ - "Ġj", - "ËĮÉĻh" - ], - [ - "Ġf", - "ËĮÉĶɾ" - ], - [ - "f", - "ɹ" - ], - [ - "Ġk", - "ËĪaɪn" - ], - [ - "ĠËĪÉĴ", - "lsÉĻÊĬ" - ], - [ - "θ", - "ɪÅĭ" - ], - [ - "Ġth", - "ËĪonÉ¡Éľ" - ], - [ - "t", - "ËĪÉij" - ], - [ - "θj", - "o" - ], - [ - "m", - "ËĪÉĶ" - ], - [ - "Ġ", - "os" - ], - [ - "Ġs", - "ÊĬ" - ], - [ - "ĠsËĪÊĮ", - "mÉĻ" - ], - [ - "ĠvËĮÉĽ", - "n" - ], - [ - "n", - "ËĪo" - ], - [ - "ĠËĪak", - "tÊĥuËIJ" - ], - [ - "É£", - "a" - ], - [ - "Ġtʰ", - "i" - ], - [ - "Ġf", - "ËĮi" - ], - [ - "Ġv", - "ËĪÉĽl" - ], - [ - "ĠtËĪu", - "tËIJi" - ], - [ - "x", - "os" - ] - ] - } -} \ No newline at end of file From b70c49502062f0d7f6307ec763218b7f511e8cab Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:59:38 -0400 Subject: [PATCH 77/94] remove name from training modes Signed-off-by: Paarth Neekhara --- examples/tts/conf/magpietts/easy_magpietts.yaml | 6 +++--- .../tts/conf/magpietts/easy_magpietts_lhotse.yaml | 6 +++--- nemo/collections/tts/models/easy_magpietts.py | 11 +++++++---- nemo/collections/tts/modules/__init__.py | 6 +----- tests/collections/tts/test_infer_vs_process_batch.py | 1 - 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 
a668686dc28c..c6612499993d 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -90,10 +90,10 @@ model: # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. - # During inference, you can specify which mode to use via the 'inference_mode' parameter. + # During inference, you can specify which mode to use via the derived + # 'inference_mode' string: "{text_input_mode}_{streaming_phonemes_delay}_{streaming_speech_delay}". training_modes: - - name: "streaming_0_1" - text_input_mode: "streaming" # Options: "full", "streaming" + - text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 0 streaming_speech_delay: 1 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 6eb4d03a98d2..f43814a6a479 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -85,10 +85,10 @@ model: # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. - # During inference, you can specify which mode to use via the 'inference_mode' parameter. + # During inference, you can specify which mode to use via the derived + # 'inference_mode' string: "{text_input_mode}_{streaming_phonemes_delay}_{streaming_speech_delay}". 
training_modes: - - name: "streaming_0_1" - text_input_mode: "streaming" # Options: "full", "streaming" + - text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 0 streaming_speech_delay: 1 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e8bb877dfb53..b366a32cd024 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -76,19 +76,24 @@ class TrainingMode: Configuration for a training mode in multi-mode training. Attributes: - name: Unique identifier for this mode (e.g., "full", "streaming_4_8") text_input_mode: Either "full" or "streaming" streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) streaming_speech_delay: Delay for speech stream (only used in streaming mode) mode_idx: Index of this mode in the list of modes (used for task embedding lookup) """ - name: str text_input_mode: str streaming_phonemes_delay: int streaming_speech_delay: int mode_idx: int + @property + def name(self) -> str: + """Derived identifier used for inference selection and logging.""" + return ( + f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" + ) + @dataclass class ProcessBatchOutput: @@ -335,7 +340,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Create a default training mode for backward compatibility self.training_modes = [ TrainingMode( - name="streaming_4_8", text_input_mode="streaming", streaming_phonemes_delay=4, streaming_speech_delay=8, @@ -347,7 +351,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.training_modes = [] for mode_idx, mode_cfg in enumerate(training_modes_cfg): mode = TrainingMode( - name=mode_cfg.name, text_input_mode=mode_cfg.text_input_mode, streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), diff --git 
a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index 866f418dbacd..ceda09492ada 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -14,8 +14,4 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules -import nemo.collections.tts.modules.moe_modules -from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder -from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder -from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet -from nemo.collections.tts.modules.waveglow import WaveGlowModule +import nemo.collections.tts.modules.moe_modules \ No newline at end of file diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index 3741deddf430..0ea66e2870ef 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -73,7 +73,6 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: # Training modes (single streaming mode) 'training_modes': [ { - 'name': 'streaming_4_8', 'text_input_mode': 'streaming', 'streaming_phonemes_delay': 4, 'streaming_speech_delay': 8, From 00acdb475a3f1b9c94e9934f9180492f9416b96d Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 9 Mar 2026 19:47:54 +0000 Subject: [PATCH 78/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 4 +--- nemo/collections/tts/modules/__init__.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index b366a32cd024..d999fcc31739 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -90,9 +90,7 @@ class TrainingMode: @property def name(self) -> str: """Derived 
identifier used for inference selection and logging.""" - return ( - f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" - ) + return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" @dataclass diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index ceda09492ada..c4dffba34215 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -14,4 +14,4 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules -import nemo.collections.tts.modules.moe_modules \ No newline at end of file +import nemo.collections.tts.modules.moe_modules From 49bd6ff7e836bc7fdaeb47fa6caba554b4eefce5 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Mar 2026 14:14:44 -0700 Subject: [PATCH 79/94] removing some debugging statements Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/data/text_to_speech_dataset_lhotse.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 356cc8ca4d15..c1ac9975d215 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -205,10 +205,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: self.eos_id = self.bos_id + 1 self.pad_id = self.text_tokenizer.pad + # initialize the phoneme tokenizer once per dataset/worker when config is available. 
if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: - worker_info = torch.utils.data.get_worker_info() - worker_id = worker_info.id if worker_info is not None else 0 - logging.info(f"Worker {worker_id} initializing phoneme tokenizer...") self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) # define list to store batched information From fdbf72d059493ff0db2b7be5fa10242d22582e4b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 19:24:15 -0700 Subject: [PATCH 80/94] new base class (#68) * new base class Signed-off-by: Paarth Neekhara * Magpie models refactoring Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_inference.py | 8 +- nemo/collections/tts/models/__init__.py | 4 + nemo/collections/tts/models/base_magpietts.py | 569 ++++ nemo/collections/tts/models/easy_magpietts.py | 2336 +---------------- .../tts/models/easy_magpietts_inference.py | 2018 ++++++++++++++ nemo/collections/tts/models/magpietts.py | 621 +---- .../tts/modules/magpietts_inference/utils.py | 3 + 7 files changed, 2638 insertions(+), 2921 deletions(-) create mode 100644 nemo/collections/tts/models/base_magpietts.py create mode 100644 nemo/collections/tts/models/easy_magpietts_inference.py diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index d38c093eb1de..f1ed60c27428 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -559,12 +559,8 @@ def main(argv=None): model_inference_parameters[field_name] = arg_from_cmdline if "max_decoder_steps" not in model_inference_parameters: - if args.longform_mode in {'always', 'auto'}: - model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps - elif args.is_decoder_only_model: + if args.is_decoder_only_model: model_inference_parameters["max_decoder_steps"] = 300 - else: - model_inference_parameters["max_decoder_steps"] = 440 
inference_config = InferenceConfig( model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), @@ -581,8 +577,6 @@ def main(argv=None): phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, legacy_context_stacking=args.legacy_context_stacking, - longform_mode=args.longform_mode, - longform_word_threshold=args.longform_word_threshold, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 0783c79bacab..28d49bca1c81 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,7 +14,9 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel +from nemo.collections.tts.models.easy_magpietts_inference import EasyMagpieTTSInferenceModel from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL @@ -30,6 +32,7 @@ __all__ = [ "AlignerModel", "AudioCodecModel", + "BaseMagpieTTSModel", "FastPitchModel", "FastPitchModel_SSL", "SSLDisentangler", @@ -37,6 +40,7 @@ "InferBatchOutput", "MagpieTTSModel", "EasyMagpieTTSModel", + "EasyMagpieTTSInferenceModel", "EasyMagpieTTSModelOnlinePO", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py new file mode 100644 index 000000000000..f3eacb945051 --- /dev/null +++ b/nemo/collections/tts/models/base_magpietts.py @@ -0,0 +1,569 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional + +import numpy as np +import torch +from torch.utils.data import get_worker_info + +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + instantiate_phoneme_tokenizer, + setup_tokenizers, +) +from nemo.collections.tts.modules.magpietts_modules import ( + SpecialAudioToken, + cosine_schedule, +) +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes import ModelPT +from nemo.utils import logging + + +def worker_init_fn(worker_id): + """Per-worker init for DataLoader workers. + + Sets up tokenizers for the dataset (text and optionally phoneme) + when using multiprocessing. + """ + logging.info(f"Worker {worker_id} initializing...") + worker_info = get_worker_info() + dataset = worker_info.dataset + tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) + dataset.text_tokenizer = tokenizer + if hasattr(dataset, 'phoneme_tokenizer_config'): + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) + + +class BaseMagpieTTSModel(ModelPT): + """Base class for MagpieTTS models. + + Contains shared functionality for audio codec helpers, special token + manipulation, local transformer functions, and state dict handling. + Subclasses (EasyMagpieTTSModel, MagpieTTSModel) provide their own + ``__init__``, data loading, training/inference logic, etc. 
+ """ + + # ------------------------------------------------------------------ + # State-dict exclusion – subclasses override + # ------------------------------------------------------------------ + + def _get_state_dict_keys_to_exclude(self) -> List[str]: + """Return list of key substrings to exclude from checkpoint save/load. + + Subclasses should override to specify model-specific exclusions + (e.g. codec model, eval models). + """ + return ['_codec_model'] + + # ------------------------------------------------------------------ + # state_dict / load_state_dict / optimizer param groups + # ------------------------------------------------------------------ + + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + if not strict: + super().load_state_dict(state_dict, strict=False) + modules_to_skip = self._get_state_dict_keys_to_exclude() + for name, child in self.named_children(): + if name in modules_to_skip: + continue + if any(param.numel() > 0 for param in child.parameters()): + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." 
+ if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot):]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + + # ------------------------------------------------------------------ + # Special token helpers + # ------------------------------------------------------------------ + + def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): + # codes: (B, C, T') + codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) + codes_len = codes_len + num_eos_tokens + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] - 1] = eos_id + return codes, codes_len + + def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): + # codes: (B, C, T') + codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) + codes_len = codes_len + num_bos_tokens + codes, codes_len = self.add_eos_token( + codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens + ) + return codes, codes_len + + def remove_bos_token(self, codes, codes_len, num_tokens=1): + codes = codes[:, :, num_tokens:] + codes_len = codes_len - num_tokens + return codes, codes_len + + def remove_embedded_bos_token(self, embedded, embedded_len): + embedded = embedded[:, 1:, :] + embedded_len = 
embedded_len - 1 + return embedded, embedded_len + + def remove_eos_token(self, codes, codes_len): + codes_len = codes_len - 1 + codes = codes[:, :, :-1] + mask = get_mask_from_lengths(lengths=codes_len) + codes = codes * mask.unsqueeze(1) + return codes, codes_len + + def remove_embedded_eos_token(self, embedded, embedded_len): + # embedded: (B, T', D) + embedded_len = embedded_len - 1 + embedded = embedded[:, :-1, :] + mask = get_mask_from_lengths(lengths=embedded_len) + embedded = embedded * mask.unsqueeze(2) + return embedded, embedded_len + + def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): + codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) + codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) + return codes, codes_len + + # ------------------------------------------------------------------ + # Audio codec helpers + # ------------------------------------------------------------------ + + def audio_to_codes(self, audio, audio_len, sample_rate=None): + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) + return codes, codes_len + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): + if self._codec_converter is not None: + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) + return audio, audio_len, codes + + # ------------------------------------------------------------------ + # Padding / forbidden-logits helpers + # ------------------------------------------------------------------ + + def pad_audio_codes(self, audio_codes: torch.Tensor): + 
"""Pads the time dimension of the audio codes to a multiple of the frame stacking factor. + + Args: + audio_codes: (B, C, T) + Returns: + (B, C, T_padded) + """ + T = audio_codes.size(2) + T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) + num_pad = T_padded - T + audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) + return audio_codes + + def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: + """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled. + + Specifically, we forbid sampling of all special tokens except AUDIO_EOS + which is allowed by default. + + Args: + logits: (B, C, num_audio_tokens_per_codebook) + forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled. + """ + logits[ + :, + :, + SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), + ] = float('-inf') + return logits + + # ------------------------------------------------------------------ + # MaskGit helpers + # ------------------------------------------------------------------ + + def maskgit_create_random_mask(self, codes): + """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" + B, C, T = codes.shape + rand_values = torch.rand(B, T, device=codes.device) + frac_masked = cosine_schedule(rand_values) + n_masked = torch.ceil(frac_masked * C).long() + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) + mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) + mask = mask_indices < n_masked.view(B, 1, T) + mask = torch.gather(mask, 1, random_permutations) + return mask + + def maskgit_apply_random_mask(self, codes): + """Randomly replaces some codes with MASK_TOKEN following the cosine schedule.""" + mask = self.maskgit_create_random_mask(codes) + codes_with_mask = torch.where(mask, self.mask_token_id, codes) + return 
codes_with_mask, mask + + # ------------------------------------------------------------------ + # Local transformer – training + # ------------------------------------------------------------------ + + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): + """Predicts the logits for all codebooks using the local transformer. + + Used in both autoregressive (AR) and MaskGit (MG) modes during + training and validation (not inference/sampling). + + The sequence layout is slightly different between AR and MG modes, as shown below + (using an 8-codebook setup as an example):: + + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + + Args: + dec_out: (B, T', E) + audio_codes_target: (B, C, T') + targets_offset_by_one: if False, target for index 0 is codebook 0 (AR); + if True, target for index 1 is codebook 0 (MaskGit). 
+ """ + C = self.num_audio_codebooks + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + local_transformer_input = [dec_out_all] + audio_codes_target = self.pad_audio_codes(audio_codes_target).long() + for fs_index in range(self.frame_stacking_factor): + for codebook_num in range(C): + codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] + codes = codes.reshape(-1) + codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) + codebook_embedding = self.audio_in_projection(codebook_embedding) + local_transformer_input.append(codebook_embedding) + + local_transformer_input = torch.stack(local_transformer_input, dim=1) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + if not targets_offset_by_one: + local_transformer_output = local_transformer_output[:, :-1, :] + else: + local_transformer_output = local_transformer_output[:, 1:, :] + + local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) + + all_code_logits = [] + for fs_index in range(self.frame_stacking_factor): + for codebook_num in range(audio_codes_target.size(1)): + codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( + local_transformer_output[:, codebook_num + fs_index * C, :] + ) + all_code_logits.append(codebook_logits) + all_code_logits = torch.cat(all_code_logits, dim=1) + + all_code_logits = all_code_logits.view( + audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 + ) + + return all_code_logits + + # ------------------------------------------------------------------ + # Local transformer – AR sampling + # 
------------------------------------------------------------------ + + def local_transformer_sample_autoregressive( + self, + dec_output: torch.Tensor, + temperature: float = 0.7, + topk: int = 80, + unfinished_items: Dict[int, bool] = {}, + finished_items: Dict[int, bool] = {}, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_kv_cache: bool = True, + forbid_audio_eos: bool = False, + ) -> torch.Tensor: + """Sample audio codes autoregressively across codebooks using the local transformer. + + Uses multinomial sampling with temperature, top-k, and + classifier-free guidance (CFG). + + Args: + dec_output: Decoder output tensor (B, E). + temperature: Sampling temperature. When <= 0, uses argmax. + topk: Number of top-probability tokens to consider. + unfinished_items: Batch indices that have not completed generation (EOS forbidden). + finished_items: Batch indices that are completed (EOS forced). + use_cfg: Whether to use classifier-free guidance (doubled batch). + cfg_scale: Scale factor for CFG. + use_kv_cache: Whether to use key-value caching in the local transformer. + forbid_audio_eos: Whether to globally forbid audio EOS. + + Returns: + Sampled audio codes (B, num_codebooks, frame_stacking_factor). 
+ """ + self.local_transformer.reset_cache(use_cache=use_kv_cache) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) + all_preds = [] + for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + + lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :]) + codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj) + + if use_cfg: + actual_batch_size = codebook_logits.size(0) // 2 + conditional_logits = codebook_logits[:actual_batch_size] + unconditional_logits = codebook_logits[actual_batch_size:] + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + codebook_logits[:actual_batch_size] = cfg_logits + + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + + codebook_logits = self.clear_forbidden_logits( + codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos + ).squeeze(1) + + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) + else: + codebook_probs = 
torch.softmax(codebook_logits_rescored / temperature, dim=-1) + codebook_preds = torch.multinomial(codebook_probs, 1) + + if use_cfg: + codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] + all_preds.append(codebook_preds) + + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) + next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) + + all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) + all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) + if use_cfg: + all_preds = all_preds[:actual_batch_size] + + return all_preds + + # ------------------------------------------------------------------ + # Local transformer – MaskGit sampling + # ------------------------------------------------------------------ + + def local_transformer_sample_maskgit( + self, + dec_output: torch.Tensor, + temperature: float = 0.7, + topk: int = 80, + unfinished_items: Dict[int, bool] = {}, + finished_items: Dict[int, bool] = {}, + use_cfg: bool = False, + cfg_scale: float = 1.0, + n_steps: int = 3, + noise_scale: float = 0.0, + fixed_schedule: Optional[List[int]] = None, + dynamic_cfg_scale: bool = False, + sampling_type: Optional[str] = None, + forbid_audio_eos: bool = False, + ) -> torch.Tensor: + """Sample audio codes using MaskGit-like iterative prediction with the local transformer. + + If frame-stacking is enabled, the codes for all frames in the stack + are sampled, treated as one long sequence. + + Args: + dec_output: Decoder output tensor (B, E). + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider. 
+ unfinished_items: Batch indices that have not completed generation. + finished_items: Batch indices that are completed. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: Scale factor for CFG. + n_steps: Number of iterative refinement steps. + noise_scale: Scale factor for noise added to confidence scores. + fixed_schedule: Fixed schedule for number of tokens to unmask per step. + dynamic_cfg_scale: Whether to dynamically adjust CFG scale. + sampling_type: Sampling strategy (``"default"``, ``"causal"``, + ``"purity_causal"``, ``"purity_default"``). + forbid_audio_eos: Whether to globally forbid audio EOS. + + Returns: + Sampled audio codes (B, num_codebooks, frame_stacking_factor). + """ + device = dec_output.device + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) + local_transformer_input_init = self.local_transformer_in_projection(dec_output) + codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor + B = dec_output.size(0) + + min_confidence = 0 + max_confidence = 5 + confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) + codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) + sampled_codes = codes.clone() + if fixed_schedule is not None: + n_steps = len(fixed_schedule) + for step in range(n_steps): + progress = step / n_steps + frac_masked = cosine_schedule(torch.tensor(progress)) + if sampling_type == "causal" or sampling_type == "purity_causal": + frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) + if fixed_schedule is None: + n_masked = torch.ceil(codebook_seq_len * frac_masked).long() + else: + n_masked = codebook_seq_len - fixed_schedule[step] + n_unmasked = codebook_seq_len - n_masked + + if sampling_type == "causal" or sampling_type == "purity_causal": + n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) + confidences[:, n_frames_to_allow * self.num_audio_codebooks:] = 
min_confidence - 1 + + _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) + if use_cfg: + actual_batch_size = topk_indices.size(0) // 2 + assert ( + topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size] + ).all(), "Topk indices are not the same for conditional and unconditional codes" + + unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) + codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + + local_transformer_input = local_transformer_input_init + for codebook_num in range(codebook_seq_len): + next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) + + _mask = torch.ones(B, codebook_seq_len + 1, device=device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + + logits = [] + for codebook_num in range(codebook_seq_len): + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num + 1, :] + ) + logits.append(codebook_logits) + logits = torch.stack(logits, dim=1) + + if use_cfg: + actual_batch_size = logits.size(0) // 2 + conditional_logits = logits[:actual_batch_size] + unconditional_logits = logits[actual_batch_size:] + if not dynamic_cfg_scale: + current_cfg_scale = cfg_scale + else: + progress = step / (n_steps - 1) + interp = progress + current_cfg_scale = (cfg_scale - 1) * interp + 1.0 + cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits + logits[:actual_batch_size] = cfg_logits + + logits = self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) + + for item_idx in unfinished_items: + logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + 
logits[item_idx, :, :] = float('-inf') + logits[item_idx, :, self.audio_eos_id] = 0.0 + + logits_topk = torch.topk(logits, topk, dim=-1)[0] + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) + logits_rescored = logits.clone() + logits_rescored[indices_to_remove] = float('-inf') + probs = torch.softmax(logits_rescored / temperature, dim=-1) + sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) + if use_cfg: + sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] + probs[actual_batch_size:] = probs[:actual_batch_size] + if sampling_type != "purity_causal" and sampling_type != "purity_default": + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + else: + confidences = probs.max(dim=2)[0] + sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + if noise_scale > 0.0: + noise = ( + (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) + ) + confidences += noise + confidences[actual_batch_size:] = confidences[:actual_batch_size] + confidence_eps = 0.1 + assert ( + confidences.max() + confidence_eps < max_confidence + ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" + confidences.scatter_( + index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) + ) + codes = sampled_codes + assert not ( + codes == self.mask_token_id + ).any(), "Codes contain mask tokens after completion of MaskGit sampling" + + codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) + + if use_cfg: + codes = codes[:actual_batch_size] + return codes diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d999fcc31739..115b8e2d6a99 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,10 +14,8 @@ import json import os 
import random -import time from dataclasses import dataclass -from functools import partial -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import soundfile as sf @@ -28,9 +26,7 @@ from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig from torch import nn -from torch.utils.data import get_worker_info from torch.utils.data.distributed import DistributedSampler -from transformers import AutoConfig, AutoModel, AutoModelForCausalLM import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate @@ -41,15 +37,15 @@ instantiate_phoneme_tokenizer, setup_tokenizers, ) -from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.modules import transformer_2501 -from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter -from nemo.collections.tts.modules.magpietts_modules import ( - CharAwareSubwordEncoder, - LocalTransformerType, - SpecialAudioToken, - cosine_schedule, +from nemo.collections.tts.models.base_magpietts import worker_init_fn +from nemo.collections.tts.models.easy_magpietts_inference import ( + EasyMagpieTTSInferenceModel, + InferBatchOutput, + StreamingFinalizeOutput, + StreamingState, + TrainingMode, ) +from nemo.collections.tts.modules.magpietts_modules import LocalTransformerType from nemo.collections.tts.parts.utils.helpers import ( compute_utmos_scores_from_filepaths, get_mask_from_lengths, @@ -58,8 +54,6 @@ transcribe_with_whisper, transcribe_with_whisper_from_filepaths, ) -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging try: @@ -70,29 +64,6 @@ HAVE_UTMOSV2 = False -@dataclass -class TrainingMode: - """ - Configuration for a training mode in multi-mode training. 
- - Attributes: - text_input_mode: Either "full" or "streaming" - streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) - streaming_speech_delay: Delay for speech stream (only used in streaming mode) - mode_idx: Index of this mode in the list of modes (used for task embedding lookup) - """ - - text_input_mode: str - streaming_phonemes_delay: int - streaming_speech_delay: int - mode_idx: int - - @property - def name(self) -> str: - """Derived identifier used for inference selection and logging.""" - return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" - - @dataclass class ProcessBatchOutput: """ @@ -100,19 +71,19 @@ class ProcessBatchOutput: Attributes: loss: Total combined loss (codebook_loss + phoneme_loss + local_transformer_loss) - codebook_loss: Loss for audio codebook prediction - phoneme_loss: Loss for phoneme prediction (None if phoneme_tokenizer is not used) - local_transformer_loss: Loss from local transformer (None if not using local transformer) - local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) - logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) - phoneme_logits: Predicted phoneme logits, shape (B, T', phoneme_stacking_factor * phoneme_vocab_size). None if no phoneme tokenizer. - phoneme_tokens_target: Target phoneme tokens (shifted), shape (B, S, T'). None if no phoneme tokenizer. - phoneme_tokens_lens_target: Length of target phoneme tokens (B,). None if no phoneme tokenizer. 
- audio_codes_target: Target audio codes for the decoder, shape (B, C, T') - audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) - context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') - context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - selected_training_mode: Name of the selected training mode (None if multi_mode_training is disabled) + codebook_loss: Cross-entropy loss for parallel audio codebook prediction + phoneme_loss: Cross-entropy loss for phoneme prediction (None if no phoneme tokenizer) + local_transformer_loss: Loss from local transformer (None if not used) + local_transformer_logits: Logits from local transformer (None if not used) + logits: Predicted logits for audio codes (B, T', num_codebooks * num_tokens_per_codebook) + phoneme_logits: Predicted logits for phoneme tokens (None if no phoneme tokenizer) + phoneme_tokens_target: Target phoneme tokens for loss computation + phoneme_tokens_lens_target: Lengths of target phoneme tokens + audio_codes_target: Target audio codes for loss computation (B, C, T'-1) + audio_codes_lens_target: Lengths of target audio codes (B,) + context_audio_codes: Processed context audio codes (B, C, T') + context_audio_codes_lens: Length of processed context audio codes (B,) + selected_training_mode: Name of the training mode used for this batch (e.g., "streaming_4_8") """ loss: torch.Tensor @@ -128,260 +99,22 @@ class ProcessBatchOutput: audio_codes_lens_target: torch.Tensor context_audio_codes: torch.Tensor context_audio_codes_lens: torch.Tensor - selected_training_mode: Optional[str] = None + selected_training_mode: Optional[str] -@dataclass -class StreamingState: +class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ - State for streaming TTS inference with batch support. - - This dataclass maintains all the necessary state for autoregressive streaming - generation, allowing text tokens to be fed incrementally. 
Supports arbitrary - batch sizes where each batch item can have different context lengths and be - in different phases. - - The streaming operates in four phases (per batch item): - 1. Context phase (context_position < full_context_lens): Processing remaining context - 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions - 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only - 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions - - Attributes: - batch_size: Number of items in the batch. - past_key_values: KV cache from the transformer for efficient autoregressive decoding. - cache_seq_len: Current sequence length in the cache. - all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. - all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). - context_audio_codes: Processed context audio codes with special tokens. - context_audio_codes_lens: Length of context audio codes. - context_lens: Total context length (task_embedding + context_audio + context_text). - full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). - full_context_lens: Full context length for each batch item (B,). - context_position: How much context has been processed per batch item (B,). - text_tokens_seen: Number of text tokens processed so far per batch item (B,). - phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). - audio_steps: Number of audio prediction steps taken per batch item (B,). - phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. - phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. - finished: Whether generation is complete per batch item (B,) bool tensor. - device: Device tensors are on. 
- training_mode: The training mode being used for inference. - use_cfg: Whether classifier-free guidance is enabled. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer for inference. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). - last_hidden: Last hidden state from transformer. - text_finished: Whether text input has finished per batch item (B,) bool tensor. - phoneme_input_type: 'gt' or 'pred' for phoneme tokens. - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). - last_audio_codes: Last predicted audio codes (B, num_codebooks). - audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). - audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. - phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). - phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. 
- """ - - batch_size: int - past_key_values: Optional[Tuple] - cache_seq_len: int - all_predictions: List[torch.Tensor] - all_phoneme_predictions: List[torch.Tensor] - context_audio_codes: torch.Tensor - context_audio_codes_lens: torch.Tensor - context_lens: torch.Tensor - full_context_embedding: torch.Tensor - full_context_lens: torch.Tensor - context_position: torch.Tensor - text_tokens_seen: torch.Tensor - phoneme_steps: torch.Tensor - audio_steps: torch.Tensor - phoneme_stream_ended: torch.Tensor - phoneme_eos_detected: torch.Tensor - finished: torch.Tensor - device: torch.device - training_mode: TrainingMode - use_cfg: bool - cfg_scale: float - use_local_transformer: bool - temperature: float - topk: int - dummy_context_embedding_unconditional: Optional[torch.Tensor] - last_hidden: torch.Tensor - text_finished: torch.Tensor - phoneme_input_type: str - phoneme_sampling_method: str - last_phoneme_tokens: Optional[torch.Tensor] - last_audio_codes: Optional[torch.Tensor] - audio_prediction_start_idx: torch.Tensor - audio_prediction_end_idx: torch.Tensor - phoneme_prediction_start_idx: torch.Tensor - phoneme_prediction_end_idx: torch.Tensor - gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings - gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking - gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings - gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking - - -@dataclass -class StreamingFinalizeOutput: - """Output from streaming_finalize containing audio and phoneme predictions.""" + Magpie-TTS Model Decoder Only Model with training support. 
- audio: torch.Tensor # (B, max_audio_len) generated audio waveform - audio_len: torch.Tensor # (B,) length of audio per batch item - audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes - audio_codes_len: torch.Tensor # (B,) length of codes per batch item - phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item - phoneme_text: List[str] # Decoded phoneme strings per batch item - - -@dataclass -class InferBatchOutput: - """Output dataclass for EasyMagpieTTS infer_batch method.""" - - predicted_audio: torch.Tensor # (B, T_audio) - predicted_audio_lens: torch.Tensor # (B,) - predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) - predicted_codes_lens: torch.Tensor # (B,) - rtf_metrics: Dict[str, Any] - predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) - predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item - phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens - - -def worker_init_fn(worker_id): - # For mp.set_start_method("spawn", force=True) - # The dataset class should be picklable, so we initialize non-picklable objects here - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset # Get the dataset instance in this worker - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) - - -class EasyMagpieTTSModel(ModelPT): - """ - Magpie-TTS Model Decoder Only Model - audio/text + Subclasses EasyMagpieTTSInferenceModel to add training_step, validation_step, + process_batch, data loading, and training-specific configuration (loss weights, + phoneme corruption, eval models for validation 
metrics). """ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_devices - - # load codec - codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) - self.sample_rate = codec_model.sample_rate - self.output_sample_rate = codec_model.output_sample_rate - - if hasattr(codec_model, "discriminator"): - # del codec discriminator to free memory - del codec_model.discriminator - - # Set up codebook configuration - vector_quantizer = cfg.get('vector_quantizer') - if vector_quantizer is not None: - vector_quantizer = instantiate(vector_quantizer) - num_audio_codebooks = vector_quantizer.num_codebooks - codebook_size = vector_quantizer.codebook_size - codec_converter = VectorQuantizerIndexConverter( - vector_quantizer_original=codec_model.vector_quantizer, - vector_quantizer_new=vector_quantizer, - ) - data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks - else: - num_audio_codebooks = codec_model.num_codebooks - data_num_audio_codebooks = num_audio_codebooks - codebook_size = codec_model.codebook_size - codec_converter = None - - # The dataloader needs to know the number of codebooks that the context codes were stored in - # In the case where there are no context codes saved, and there is no context audio (in the text context path), - # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for - # data_num_audio_codebooks - self.data_num_audio_codebooks = data_num_audio_codebooks - self.num_audio_codebooks = num_audio_codebooks - self.codebook_size = codebook_size - - self.codec_model_samples_per_frame = codec_model.samples_per_frame - # Our codebooks start with actual audio codec tokens, followed by special tokens. - # The `forced_*` options are for backward compatibility for models trained with older code. 
- # Our codebooks start with actual audio codec tokens, followed by special tokens. - # The `forced_*` options are for backward compatibility for models trained with older code. - get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) - self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS) - self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS) - self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS) - self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS) - self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN) - self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken) - self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False) - - # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer. - self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None) - if self.text_conditioning_tokenizer_name is None: - self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] - - self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) - - # Multi-mode training configuration - # The model trains with multiple text input modes (full, streaming with various delays) - # Each mode has its own task embedding that is prepended to the context - training_modes_cfg = cfg.get('training_modes', None) - if training_modes_cfg is None: - # Create a default training mode for backward compatibility - self.training_modes = [ - TrainingMode( - text_input_mode="streaming", - streaming_phonemes_delay=4, - streaming_speech_delay=8, - mode_idx=0, - ) - ] - - else: - self.training_modes = [] - for mode_idx, mode_cfg in enumerate(training_modes_cfg): - mode = TrainingMode( - text_input_mode=mode_cfg.text_input_mode, - streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), - streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 
0), - mode_idx=mode_idx, - ) - self.training_modes.append(mode) - - logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") - for mode in self.training_modes: - logging.info( - f" - {mode.name}: text_input_mode={mode.text_input_mode}, " - f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " - f"streaming_speech_delay={mode.streaming_speech_delay}" - ) - - # Create a mapping from mode name to mode object for easy lookup during inference - self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} - # Default mode for inference if not specified (first mode in the list) - self.default_inference_mode = self.training_modes[0].name - - self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1) - - self.tokenizer = setup_tokenizers( - all_tokenizers_config=cfg.text_tokenizers, - mode='train', - ) + super().__init__(cfg=cfg, trainer=trainer) - num_tokens_tokenizer = len(self.tokenizer.tokens) - num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK - self.bos_id = num_tokens - 3 - self.eos_id = num_tokens - 2 - self.cfg_unk_token_id = num_tokens - 1 - self.phoneme_tokenizer = None + # Training-specific configuration self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0) self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) @@ -390,163 +123,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) - if cfg.get('phoneme_tokenizer', None) is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) - self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) - self.phoneme_vocab_size = 
self.phoneme_tokenizer.vocab_size - if cfg.get('phoneme_corruption_batch_prob', None) is None: - # Legacy mode: remove the UNK token from the phoneme vocabulary - # TODO: Remove this. - self.phoneme_vocab_size -= 1 - # If max phoneme probability is below this threshold at inference-time, - # replace the predicted timestep with UNK to reduce error propagation. - self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) - - self.pad_context_text_to_max_duration = False - self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) - - super().__init__(cfg=cfg, trainer=trainer) - - # This needs to happen after super().__init__() - self._codec_model = codec_model - self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() - self._codec_converter = codec_converter - - # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters - self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) - - audio_embeddings = [] - for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) - self.audio_embeddings = nn.ModuleList(audio_embeddings) - # Projection from audio_embedding_dim to embedding_dim (Identity if same) - if self.audio_embedding_dim != cfg.embedding_dim: - self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) - else: - self.audio_in_projection = nn.Identity() - - if self.phoneme_tokenizer is not None: - phoneme_embeddings = [] - for _ in range(self.phoneme_stacking_factor): - phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) - self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) - self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - - # Decoder backend selection - supports HuggingFace models or NemotronH - self.decoder_type 
= cfg.get('decoder_type', 'huggingface') # backward compatible default - logging.info(f"Using decoder type: {self.decoder_type}") - - if self.decoder_type == 'huggingface': - # Existing HuggingFace path - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - self.lm_text_head = hf_transformer.lm_head - - elif self.decoder_type == 'nemotron_h': - # NemotronH hybrid Mamba2/Attention backend - from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM - - # Build config from YAML parameters - nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) - # Ensure hidden_size matches embedding_dim for compatibility - if 'hidden_size' not in nemotron_h_config_dict: - nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim - nemotron_config = NemotronHConfig(**nemotron_h_config_dict) - nemotron_model = NemotronHForCausalLM(nemotron_config) - self.decoder = nemotron_model.backbone - self.lm_text_head = nemotron_model.lm_head - logging.info( - f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." - ) - - else: - raise ValueError(f"Unknown decoder_type: {self.decoder_type}. 
Supported: 'huggingface', 'nemotron_h'") - - self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) - self.decoder.set_input_embeddings(self.text_embedding) - # self.decoder.float() - - # Task embedding for multi-mode training - # Each mode has a unique task embedding that is prepended to the context - # Only create task embedding if there are multiple modes - num_modes = len(self.training_modes) - if num_modes > 1: - self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) - logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") - else: - self.task_embedding = None - logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") - - if self.use_bpe_char_tokenizer: - # BPE char tokenizer - assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" - tokenizer_name = self.tokenizer.tokenizer_names[0] - tokenizer = self.tokenizer.tokenizers[tokenizer_name] - subword_vocab = tokenizer.get_vocab() - # special tokens will be stored as it is in the char_vocab - # Each special token will only be mapped to one char id - special_vocab = { - '': self.bos_id, - '': self.eos_id, - '': self.cfg_unk_token_id, - } - self.cas_encoder = CharAwareSubwordEncoder( - d_embed=cfg.embedding_dim, - llm_tokenizer_vocab=subword_vocab, - subword_padding_idx=self.tokenizer.pad, - special_vocab=special_vocab, - ) - - # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) - if self.audio_embedding_dim != cfg.hidden_dim: - self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) - else: - self.audio_out_projection = nn.Identity() - - self.final_proj = nn.Linear( - self.audio_embedding_dim, - self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, - ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') - self.local_transformer_type = 
LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) - logging.info(f"Local transformer type: {self.local_transformer_type}") - if self.local_transformer_type != LocalTransformerType.NO_LT: - local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) - if local_transformer_hidden_dim != cfg.hidden_dim: - self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) - else: - self.local_transformer_in_projection = nn.Identity() - self.local_transformer = transformer_2501.Transformer( - n_layers=self.cfg.get('local_transformer_n_layers', 2), - d_model=local_transformer_hidden_dim, - d_ffn=local_transformer_hidden_dim * 4, - sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), - kernel_size=1, - is_causal=self.local_transformer_type == LocalTransformerType.AR, - max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, - use_learnable_pos_emb=True, - ) - # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) - if self.audio_embedding_dim != local_transformer_hidden_dim: - self.local_transformer_audio_out_projection = nn.Linear( - local_transformer_hidden_dim, self.audio_embedding_dim - ) - else: - self.local_transformer_audio_out_projection = nn.Identity() - local_transformer_out_projections = [] - for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - # Have a separate projection layer for each codebook, to distinguish between them - local_transformer_out_projections.append( - nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) - ) - self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - # Validation inference with metrics (optional) self.run_val_inference = cfg.get('run_val_inference', False) self.use_multilingual_asr = cfg.get('use_multilingual_asr', False) @@ -584,270 +163,15 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._utmos_calculator 
= UTMOSv2Calculator(device='cpu') logging.info("UTMOSv2 calculator initialized for validation naturalness scoring") - def setup_optimizer_param_groups(self): - """ - Override to exclude frozen eval/inference-only models from the optimizer. - This prevents optimizer state mismatch errors when resuming from checkpoints - that were saved before these eval models were added. - """ - modules_to_exclude = { - '_speaker_verification_model', - '_codec_model', - '_eval_asr_model', - '_eval_speaker_verification_model', - 'whisper_model', - 'whisper_processor', - '_utmos_calculator', - } - - # Collect parameter ids to exclude - excluded_param_ids = set() - for name, module in self.named_children(): - if name in modules_to_exclude: - for param in module.parameters(): - excluded_param_ids.add(id(param)) - - # Build param group with only trainable (non-excluded) parameters - trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] - - logging.info( - f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " - f"{len(excluded_param_ids)} params excluded (eval models)" - ) - - self._optimizer_param_groups = [{"params": trainable_params}] - - def state_dict(self, destination=None, prefix='', keep_vars=False): - """ - Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model - from the checkpoint. The codec model is saved in a separate checkpoint. 
- """ - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - # Don't save the speaker verification and codec model in the state dict - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = [ + def _get_state_dict_keys_to_exclude(self): + return super()._get_state_dict_keys_to_exclude() + [ '_speaker_verification_model', - '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', '_utmos_calculator', ] - for key in list(state_dict.keys()): - if any([substring in key for substring in keys_substrings_to_exclude]): - del state_dict[key] - return state_dict - - def load_state_dict(self, state_dict, strict=True): - """ - Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when - strict is True. - When strict is False, we can call pytorch's load_state_dict. - When strict is True, we loop through all parameters and rename them to enable loading. - """ - if strict == False: - super().load_state_dict(state_dict, strict=False) - for name, child in self.named_children(): - if name in [ - '_speaker_verification_model', - '_codec_model', - '_eval_asr_model', - '_eval_speaker_verification_model', - 'whisper_model', - 'whisper_processor', - '_utmos_calculator', - ]: - continue - if any(param.numel() > 0 for param in child.parameters()): - # If the module has parameters, we want to change the default mapping so that the state_dict gets - # loaded. - # Ex: state_dict[encoder.position_embeddings.weight] -> new_state_dict[position_embeddings.weight] - new_state_dict = {} - for key in state_dict.keys(): - name_with_dot = f"{name}." 
- if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot) :]] = state_dict[key] - child.load_state_dict(new_state_dict) - - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - # Insert EOS token at new final token entry - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - # codes: (B, T', C) - # codes_len: (B,) - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - # embedded_len: (B,) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, 
num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - self._codec_model.eval() - if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: - # Unstack the audio codes if they are stacked - codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Pass the modified integer token IDs - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - if codes_len.min() < 4: - # Pad the codes with 0s to make the minimum length 4 - # codes is (B, C, T) - codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) - # Updates all lens less than 4 to 4 - codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:, :, : codes_len.max()] - - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - # audio: (B, T) - # audio_len: (B,) - return audio, audio_len, codes - - def embed_audio_tokens(self, audio_tokens): - # audio_tokens: (B, C, T') - # Add and average the embeddings of the audio tokens across the codebooks - audio_embedding = None - for c in range(audio_tokens.size(1)): - embedding = self.audio_embeddings[c](audio_tokens[:, c, :]) - if audio_embedding is None: - 
audio_embedding = embedding - else: - audio_embedding = audio_embedding + embedding - audio_embedding = audio_embedding / audio_tokens.size(1) - # Project from audio_embedding_dim to embedding_dim - audio_embedding = self.audio_in_projection(audio_embedding) - return audio_embedding - - def embed_phoneme_tokens(self, phoneme_tokens): - # phoneme_tokens: (B, S, T') - phoneme_embedding = None - for c in range(phoneme_tokens.size(1)): - embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :]) - if phoneme_embedding is None: - phoneme_embedding = embedding - else: - phoneme_embedding = phoneme_embedding + embedding - phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) - return phoneme_embedding - - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """ - Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. - This function is used in training and validation, not inference/sampling. - The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, - (using an 8-codebook setup as an example): - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Seq. 
Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) - if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) - """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', hidden_dim) - local_transformer_input = [dec_out_all] - for codebook_num in range(audio_codes_target.size(1)): - codes = audio_codes_target[:, codebook_num] # (B, T') - codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', audio_embedding_dim) - # Project from audio_embedding_dim to embedding_dim - codebook_embedding = self.audio_in_projection(codebook_embedding) - local_transformer_input.append(codebook_embedding) - - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) - if not targets_offset_by_one: - # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) - else: - # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
- local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) - # Project from local_transformer_hidden_dim to audio_embedding_dim - local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) - all_code_logits = [] - for codebook_num in range(audio_codes_target.size(1)): - # Using a separate projection layer for each codebook (to distinguish between them) - # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num, :] - ) # (B*T', num_all_tokens_per_codebook) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2), -1 - ) # (B, T', C * num_all_tokens_per_codebook) - - return all_code_logits def compute_loss(self, logits, audio_codes, audio_codes_lens): """ @@ -898,192 +222,6 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): - # Only pass cache_position for NemotronH (HF transformers may not accept it) - if self.decoder_type == 'nemotron_h': - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - cache_position=cache_position, - ) - else: - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - ) - # hidden_states = backend_out.last_hidden_state # (B, T_total, H) - return backend_out - - def logits_to_audio_codes(self, 
all_code_logits, audio_codes_lens): - # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) - # audio_codes_lens: (B,) - all_preds = [] - for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): - si = idx * self.num_all_tokens_per_codebook - ei = si + self.num_all_tokens_per_codebook - codebook_logits = all_code_logits[:, :, si:ei] - codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) - # argmax to get the tokens - codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') - all_preds.append(codebook_preds) - - all_preds = torch.stack(all_preds, dim=1) # (B, C, T') - audio_mask = get_mask_from_lengths(audio_codes_lens) - all_preds = all_preds * audio_mask.unsqueeze(1) - - return all_preds - - def local_transformer_sample_autoregressive( - self, - dec_output, - temperature=0.7, - topk=80, - unfinished_items={}, - finished_items={}, - use_cfg=False, - cfg_scale=1.0, - ): - # dec_output: (B, E) - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - # Project from local_transformer_hidden_dim to audio_embedding_dim - local_transformer_output_projected = self.local_transformer_audio_out_projection( - local_transformer_output[:, -1, :] - ) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output_projected - ) # (B, num_all_tokens_per_codebook) - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - 
unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - # Replace NaN/inf then clamp to prevent extreme values (e.g. from CFG) causing NaN in softmax - # print("codebook_logits stats before nan_to_num") - # print(f"min: {codebook_logits.min()}, max: {codebook_logits.max()}, mean: {codebook_logits.mean()}, std: {codebook_logits.std()}") - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - if use_cfg: - codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( - 1 - ) # (B, 1, audio_embedding_dim) - # Project from audio_embedding_dim to embedding_dim, then to local_transformer_hidden_dim - next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) - 
next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, local_transformer_hidden_dim) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, local_transformer_hidden_dim) - - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - - def sample_codes_from_logits( - self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} - ): - # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep - all_preds = [] - for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): - si = idx * self.num_all_tokens_per_codebook - ei = si + self.num_all_tokens_per_codebook - codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) - # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - 
codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - all_preds.append(codebook_preds) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - return all_preds - - def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): - # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep - all_preds = [] - for idx in range(self.phoneme_stacking_factor): - si = idx * self.phoneme_vocab_size - ei = si + self.phoneme_vocab_size - codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) - # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - all_preds.append(codebook_preds) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - return all_preds - def log_val_audio_example( self, logits, @@ -1169,181 +307,6 @@ def log_val_audio_example( return wandb_audio_log - def join_embeddings_temporally( - self, - embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] - lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` - pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros - ) -> 
Tuple[torch.Tensor, torch.Tensor]: - """ - Merges Multiple Embedding sequences into a single Embedding Sequence. - - Args: - embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding - lengths : Sequence of tensors, each of shape (B,) - pad_embed : (E,) — embedding to use for padding, defaults to zeros - - Returns: - joined : (B, max_sum_len, E) — merged & padded - out_lengths : (B,) — total lengths of each batch element after merging - """ - if len(embeddings) == 0: - raise ValueError("contexts must be non-empty") - - B, _, E = embeddings[0].shape - device = embeddings[0].device - dtype = embeddings[0].dtype - - # 1. compute output sizes - len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) - out_lengths = len_stack.sum(0) - max_len = int(out_lengths.max()) - - if pad_embed is None: - pad_embed = torch.zeros(E, dtype=dtype, device=device) - - joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) - - # batch row indices - batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) - - # running offset keeps “write cursor” for each row - offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) - - for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): - Ti = embedding_i.shape[1] - t_idx = torch.arange(Ti, device=device) # (Ti,) - mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) - - # destination columns: offset + t - dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) - - # Assign embedding_i to the correct positions in joined - # Ensure dtype matches to avoid errors during mixed-precision training - joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) - - # move cursor past this segment - offset += len_i - - return joined, out_lengths - - def prepare_context_tensors( - self, - context_text_tokens: torch.Tensor, - context_text_tokens_lens: torch.Tensor, - context_audio_codes: Optional[torch.Tensor] = None, - context_audio_codes_lens: Optional[torch.Tensor] 
= None, - context_audio: Optional[torch.Tensor] = None, - context_audio_lens: Optional[torch.Tensor] = None, - training_mode: Optional[TrainingMode] = None, - dropout_conditional_input: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Prepare context tensors (without text) for the simplified process_batch. - - This function processes context audio and context text to create the combined - context embedding. - Args: - context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) - context_text_tokens_lens: Length of context text for each batch item (B,) - context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). - If None, will be computed from context_audio. - context_audio_codes_lens: Length of context audio codes (B,). - Required if context_audio_codes is provided. - context_audio: Raw context audio waveform (B, T). - Used to compute context_audio_codes if not provided. - context_audio_lens: Length of context audio (B,). - Required if context_audio is provided. - training_mode: Optional TrainingMode object specifying the mode to use. - If None, uses the first mode from training_modes as default. - dropout_conditional_input: If True, replace context with CFG unconditional token. 
- - Returns: - Tuple of: - - context_embedding: Combined context embedding (B, T_context, E) - - context_lens: Total context length per batch item (B,) - - context_audio_codes: Processed audio codes with special tokens (B, C, T') - - context_audio_codes_lens: Length of processed context audio codes (B,) - """ - # Determine the mode parameters to use - if training_mode is None: - training_mode = self.training_modes[0] - - current_mode_idx = training_mode.mode_idx - batch_size = context_text_tokens.size(0) - device = context_text_tokens.device - - # Context Audio - if context_audio_codes is None: - if context_audio is None: - raise ValueError("Either context_audio_codes or context_audio must be provided") - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - - if self._codec_converter is not None: - context_audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens - ).long() - - context_audio_codes, context_audio_codes_lens = self.add_special_tokens( - codes=context_audio_codes, - codes_len=context_audio_codes_lens, - bos_id=self.context_audio_bos_id, - eos_id=self.context_audio_eos_id, - ) - - # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = ( - self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - ) - stack_eos_id = ( - self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - ) - - context_audio_codes, context_audio_codes_lens = self.stack_codes( - context_audio_codes, - context_audio_codes_lens, - stack_bos_id, - stack_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, - ) - context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) - - # Context Text - context_text_lens = context_text_tokens_lens - context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, 
E) - - # Prepare task embedding for multi-mode training - task_embedding = None - task_embedding_lens = None - if self.task_embedding is not None and current_mode_idx is not None: - mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) - task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) - task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) - - # Combine context embeddings: [task_embedding | context_audio | context_text] - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens], - ) - - # Handle CFG unconditional dropout - if dropout_conditional_input: - cfg_token_id = self.cfg_unk_token_id - cfg_token_embedding = self.decoder.get_input_embeddings()( - torch.full((batch_size, 1), cfg_token_id, device=device) - ) # (B, 1, E) - # Expand CFG token to match context embedding size - context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) - - return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens - def prepare_text_channel_embeddings( self, text: torch.Tensor, @@ -1652,99 +615,6 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) return sliced - def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): - """ - Stack multiple time steps into the channel dimension to reduce sequence length. 
- - This function reshapes audio/phoneme codes by grouping consecutive time steps together - and placing them in the channel dimension. This allows the model to process multiple - frames in parallel while reducing the sequence length. - - Args: - codes: Input codes tensor of shape (B, C, T) where B is batch size, - C is number of codebooks, and T is sequence length. - codes_lens: Length of valid codes for each batch item, shape (B,). - bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. - eos_id: End-of-sequence token ID used for padding. - stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. - num_codebooks: Number of codebooks in the input. - - Returns: - Tuple of: - - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). - If input contains BOS tokens, they are preserved at the beginning. - - new_lens: Updated sequence lengths after stacking, shape (B,). - """ - if stacking_factor == 1: - return codes, codes_lens - - contains_bos = codes[0, 0, 0].item() == bos_id - if contains_bos: - bos_tensor_repeated = torch.full( - (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device - ) # (B,stacking_factor*C, 1) - codes = codes[:, :, 1:] # Remove the bos token - codes_lens = codes_lens - 1 # Remove the bos token - B, C, T = codes.shape - s = int(stacking_factor) - - # --- Compute max padding needed --- - pad_t = (-T) % s # pad so that T' is divisible by s - pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) - codes = torch.cat([codes, pad_tail], dim=-1) - - # --- Stack time into channel dimension --- - Tp = codes.shape[-1] - T_out = Tp // s - codes = codes.view(B, C, T_out, s) - codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) - - new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') - if contains_bos: - codes = torch.cat([bos_tensor_repeated, codes], dim=2) - new_lens = new_lens + 1 - - return 
codes, new_lens - - def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): - """ - Reverse the stacking operation to recover the original time dimension. - - This is the inverse of `stack_codes`. It takes codes that have been stacked - in the channel dimension and expands them back into the time dimension. - - Args: - stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) - where T_stacked = T_original // stacking_factor. - stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). - stacking_factor: The stacking factor used in the original `stack_codes` call. - If 1, no unstacking is performed. - - Returns: - Tuple of: - - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). - - orig_lens: Recovered sequence lengths, shape (B,). Note that these are the - maximum possible lengths; actual valid lengths may be shorter due to - padding applied during stacking. - """ - if stacking_factor == 1: - return stacked_codes, stacked_lens - - B, CxS, T_out = stacked_codes.shape - s = int(stacking_factor) - assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" - - C = CxS // s - # Reshape: split channels back into (C, s) - x = stacked_codes.view(B, C, s, T_out) - # Bring s back into time dimension - x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) - - # Recover original lengths (before padding) - orig_lens = stacked_lens * s - - return x, orig_lens - def process_batch( self, text: torch.Tensor, @@ -2660,1145 +1530,3 @@ def val_dataloader(self): self._val_dl_wrapped_with_dist_sampler = True return self._validation_dl - - def _sample_audio_codes( - self, - last_hidden: torch.Tensor, - all_code_logits_t: torch.Tensor, - temperature: float, - topk: int, - use_local_transformer_for_inference: bool, - use_cfg: bool, - cfg_scale: float, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Sample audio codes from logits using either local transformer 
or parallel sampling. - - Returns: - audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) - all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) - """ - if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR: - audio_codes_next = self.local_transformer_sample_autoregressive( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) - else: - raise ValueError( - f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" - ) - # TODO @rfejgin: should we add argmax sampling for EOS here too? - all_codes_next_argmax = audio_codes_next - else: - # Parallel sampling from all codebook logits - audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) - # Argmax sampling for reliable EOS detection - if temperature <= 0.0: - all_codes_next_argmax = audio_codes_next # already argmax - else: - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) - - return audio_codes_next, all_codes_next_argmax - - def streaming_init( - self, - context_audio_codes: torch.Tensor, - context_audio_codes_lens: torch.Tensor, - context_text_tokens: torch.Tensor, - context_text_tokens_lens: torch.Tensor, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - phoneme_input_type: str = 'predicted', - phoneme_sampling_method: str = 'argmax', - gt_phoneme_tokens: Optional[torch.Tensor] = None, - gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, - gt_audio_codes: Optional[torch.Tensor] = None, - gt_audio_codes_lens: Optional[torch.Tensor] = None, - use_inference_mode: bool = True, - ) -> StreamingState: - """ - Initialize streaming TTS inference state. 
- - This prepares the model for streaming inference by processing the context - (audio + context text) and returning a StreamingState that can be used - with streaming_step() to incrementally generate audio. - - Note: This function does NOT take the main text input. Text tokens are - provided incrementally via streaming_step(). - - For batched inference, each batch item can have a different context length. - This function processes only up to the minimum context length across the batch, - storing the remaining context to be processed in streaming_step's context phase. - - The streaming inference follows phases (per batch item): - 1. Context phase: Processing remaining context (if any) for items with longer context. - 2. Prompt phase: First `streaming_speech_delay` text tokens are processed - without generating audio (building up context). - 3. Generation phase: Audio BOS is added and audio codes are generated - autoregressively, with remaining text tokens added to audio embeddings. - - Args: - context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). - context_audio_codes_lens: Length of context audio codes (B,). - context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). - context_text_tokens_lens: Length of context text (B,). - inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). - If None, uses the default inference mode. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor (higher = stronger conditioning). - use_local_transformer: Whether to use local transformer for AR sampling. - temperature: Sampling temperature for audio codes. - topk: Top-k sampling parameter. - phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. 
- gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). - gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, - input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. - gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. - - Returns: - StreamingState: Initial state for streaming inference. - """ - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - batch_size = context_audio_codes.size(0) - device = context_audio_codes.device - - # Resolve inference mode - mode_name = inference_mode if inference_mode is not None else self.default_inference_mode - if mode_name not in self.mode_name_to_mode: - available_modes = list(self.mode_name_to_mode.keys()) - raise ValueError(f"Unknown inference mode '{mode_name}'. Available modes: {available_modes}") - - selected_training_mode = self.mode_name_to_mode[mode_name] - - # Prepare context embedding using shared helper - context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( - self.prepare_context_tensors( - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - training_mode=selected_training_mode, - dropout_conditional_input=False, - ) - ) - - # Store full context embedding and lens before any CFG manipulation - full_context_embedding = context_embedding.clone() # (B, T_max, E) - full_context_lens = context_lens.clone() # (B,) - - # Compute min context length - we only process up to this in init - min_context_len = context_lens.min().item() - - # Setup classifier-free guidance if enabled - dummy_context_embedding_unconditional = None - if use_cfg: - dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((1, 1), self.cfg_unk_token_id, device=device) - ) - # Create unconditional context (same length as conditional) - 
dummy_context_expanded = dummy_context_embedding_unconditional.expand( - batch_size, context_embedding.size(1), -1 - ) - # Concatenate conditional and unconditional: (2*B, T, E) - context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) - - # First forward pass to process context - only up to min_context_len - cache_position = torch.arange(min_context_len, device=device) - transformer_out = self.forward( - inputs_embeds=context_embedding[:, :min_context_len, :], - attention_mask=None, - use_cache=True, - past_key_values=None, - cache_position=cache_position, - ) - - last_hidden = transformer_out.last_hidden_state - past_kv = transformer_out.past_key_values - current_cache_seq_len = min_context_len - - # Process GT phoneme tokens if provided (for teacher forcing) - gt_phoneme_embeddings = None - gt_phoneme_lens = None - if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: - gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) - gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( - gt_phoneme_expanded, - gt_phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1, - ) - gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) - - # Process GT audio codes if provided (for teacher forcing) - gt_audio_embeddings = None - gt_audio_lens_state = None - if gt_audio_codes is not None and gt_audio_codes_lens is not None: - gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) - gt_audio_lens_state = gt_audio_codes_lens - - # Initialize streaming state with batch support - state = StreamingState( - batch_size=batch_size, - past_key_values=past_kv, - cache_seq_len=current_cache_seq_len, - all_predictions=[], - all_phoneme_predictions=[], - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_lens=context_lens, - 
full_context_embedding=full_context_embedding, - full_context_lens=full_context_lens, - context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), - text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), - phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), - audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), - phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), - phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), - finished=torch.zeros(batch_size, dtype=torch.bool, device=device), - device=device, - training_mode=selected_training_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, - last_hidden=last_hidden, - text_finished=torch.zeros(batch_size, dtype=torch.bool, device=device), - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - last_phoneme_tokens=None, - last_audio_codes=None, - audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - gt_phoneme_embeddings=gt_phoneme_embeddings, - gt_phoneme_lens=gt_phoneme_lens, - gt_audio_embeddings=gt_audio_embeddings, - gt_audio_lens=gt_audio_lens_state, - ) - - return state - - def streaming_step( - self, - state: StreamingState, - text_tokens: Optional[torch.Tensor] = None, - force_dropout_text: bool = False, - use_inference_mode: bool = True, - ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform one streaming inference step 
with batch support. - - This function processes one text token per batch item (or signals end of text with None) - and generates predictions according to the streaming delays. Each batch item can be - in a different phase. - - The streaming operates in four phases per batch item: - 1. Context phase (context_position < full_context_lens): - - Still processing remaining context from streaming_init - - Uses context embedding, ignores text_tokens for this item - 2. Prompt phase (text_tokens_seen < phoneme_delay): - - Only text tokens are processed, KV cache is extended - - No phoneme or audio predictions - 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): - - Starts with phoneme BOS on first step - - Only phoneme predictions (no audio) - - Input: text embedding + phoneme embedding - 4. Audio phase (text_tokens_seen >= speech_delay): - - Starts with audio BOS on first step - - Both phoneme and audio predictions - - Input: text embedding + phoneme embedding + audio embedding - - IMPORTANT: Only ONE forward call to the decoder per streaming_step. - - Args: - state: Current StreamingState from streaming_init or previous streaming_step. - text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. - For items still in context phase, the text_token value is ignored (can be 0). - When None is passed, the model continues generating until EOS. 
- - Returns: - Tuple of: - - Updated StreamingState - - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase - where C = num_audio_codebooks and S = frame_stacking_factor - - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase - """ - if state.finished.all(): - return state, None, None - - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - device = state.device - batch_size = state.batch_size - streaming_speech_delay = state.training_mode.streaming_speech_delay - streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay - - # ==================== DETERMINE PHASES PER BATCH ITEM ==================== - needs_context = state.context_position < state.full_context_lens # (B,) bool - needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = ( - (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) - ) - needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) - - next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - # --- Context phase items: use next context embedding --- - if needs_context.any(): - # Gather context embeddings at current position for each item - # context_position: (B,) - position indices - # full_context_embedding: (B, T_max, E) - ctx_positions = state.context_position.clone() # (B,) - # Clamp positions to valid range for gathering - ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) - # Gather: need (B, 1, E) from (B, T, E) at positions (B,) - ctx_emb = state.full_context_embedding[ - torch.arange(batch_size, device=device), ctx_positions, : - ].unsqueeze( - 1 - ) # (B, 1, E) - # Only apply to items in context phase - context_mask = needs_context.view(batch_size, 1, 1).float() - next_input = next_input + ctx_emb * context_mask - - # --- 
Non-context phase items: handle text embedding --- - text_embedded = None - if text_tokens is not None and needs_text.any(): - # Embed text tokens for all items (will be masked later) - text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) - text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) - - # Handle BPE char tokenizer - if self.use_bpe_char_tokenizer: - text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) - cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) - text_embedded = text_embedded + cas_embedding - - if force_dropout_text: - text_embedded = text_embedded * 0 - - # Check for EOS tokens - mark those items as text_finished - # The EOS token itself IS embedded normally (matching process_batch behavior - # where EOS is part of the text sequence). After this step, text_finished is set - # so subsequent steps won't add any text embedding. - is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool - text_add_mask = needs_text.view(batch_size, 1, 1).float() - next_input = next_input + text_embedded * text_add_mask - state.text_finished = state.text_finished | is_eos_token - - elif text_tokens is None: - # Text finished signal for items not in context phase - state.text_finished = state.text_finished | ~needs_context - - # --- Phoneme embedding for phoneme and audio phase items --- - if self.phoneme_tokenizer is not None: - if needs_phoneme.any(): - phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - - if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: - # Teacher forcing: use pre-computed GT phoneme embeddings - # Only use GT embedding if within valid length, otherwise zero - within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) - positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) - gt_emb = state.gt_phoneme_embeddings[ - torch.arange(batch_size, device=device), positions, : - 
].unsqueeze( - 1 - ) # (B, 1, E) - phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + gt_emb * phoneme_mask - else: - # Prediction mode: use BOS or last predicted phoneme - first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) - has_last_phoneme = ( - needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) - ) - - if first_phoneme_step.any(): - phoneme_bos = torch.full( - (batch_size, self.phoneme_stacking_factor, 1), - self.phoneme_tokenizer.bos_token_id, - device=device, - ).long() - phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) - first_mask = first_phoneme_step.view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask - - if has_last_phoneme.any() and state.last_phoneme_tokens is not None: - last_phoneme_emb = self.embed_phoneme_tokens( - state.last_phoneme_tokens.unsqueeze(2) - ) # (B, 1, E) - last_mask = has_last_phoneme.view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask - - # Only end phoneme stream in prediction mode when the phoneme EOS is detected - state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected - - next_input = next_input + phoneme_emb - - # --- Audio embedding for audio phase items --- - if needs_audio.any(): - audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - - if state.gt_audio_embeddings is not None: - # Teacher forcing: use pre-computed GT audio embeddings - # Only use GT embedding if within valid length, otherwise zero - within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) - positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) - gt_emb = state.gt_audio_embeddings[ - torch.arange(batch_size, device=device), positions, : - ].unsqueeze( - 1 - ) # (B, 1, E) - audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() - audio_emb = audio_emb + gt_emb * 
audio_mask - else: - # Prediction mode: use BOS or last predicted audio - first_audio_step = needs_audio & (state.audio_steps == 0) - has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) - - if first_audio_step.any(): - # Create BOS for items at first audio step - audio_bos = torch.full( - (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) - first_mask = first_audio_step.view(batch_size, 1, 1).float() - audio_emb = audio_emb + audio_bos_emb * first_mask - - if has_last_audio.any() and state.last_audio_codes is not None: - # Use last predicted audio - last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) - last_mask = has_last_audio.view(batch_size, 1, 1).float() - audio_emb = audio_emb + last_audio_emb * last_mask - - next_input = next_input + audio_emb - - # ==================== HANDLE CFG ==================== - if state.use_cfg: - # For unconditional branch, use dummy embedding for non-audio items - # and audio-only embedding for audio items - next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( - batch_size, 1, -1 - ) - # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts - next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) - context_mask = needs_context.view(batch_size, 1, 1).float() - next_input_unconditional = ( - context_mask * next_input_unconditional_context - + (1 - context_mask) * next_input_unconditional_zeros - ) - - # For audio phase items, we use audio embedding for the unconditional branch - if needs_audio.any(): - audio_mask = needs_audio.view(batch_size, 1, 1).float() - next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask - - # Concatenate conditional and unconditional: (2*B, 1, E) 
- next_input = torch.cat([next_input, next_input_unconditional], dim=0) - - # ==================== FORWARD PASS ==================== - cache_position = torch.tensor([state.cache_seq_len], device=device) - transformer_out = self.forward( - inputs_embeds=next_input, - attention_mask=None, - use_cache=True, - past_key_values=state.past_key_values, - cache_position=cache_position, - ) - - state.last_hidden = transformer_out.last_hidden_state - state.past_key_values = transformer_out.past_key_values - state.cache_seq_len += 1 - - # ==================== UPDATE STATE ==================== - # Update context_position for items in context phase - state.context_position = state.context_position + needs_context.long() - # Keep updating text_tokens_seen for items once the context is finished - # This is because this counter is used to determine when to start predicting phonemes and audio - state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() - - # Update phoneme_steps for items in phoneme or audio phase - state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() - - # Update audio_steps for items in audio phase - state.audio_steps = state.audio_steps + needs_audio.long() - - # ==================== PREDICTIONS ==================== - pred_phoneme_tokens = None - audio_codes_next = None - - # Phoneme predictions for items in phoneme or audio phase - if needs_phoneme.any() and self.phoneme_tokenizer is not None: - # Track phoneme prediction start index for items just entering phoneme phase - first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) - if first_phoneme_step.any(): - current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append - state.phoneme_prediction_start_idx = torch.where( - first_phoneme_step, - torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), - state.phoneme_prediction_start_idx, - ) - - # Check which items should predict phonemes (not ended) - pred_phoneme_tokens = 
self._predict_phoneme_tokens(state) # (B, phoneme_stacking_factor) - state.last_phoneme_tokens = pred_phoneme_tokens - state.all_phoneme_predictions.append(pred_phoneme_tokens) - - # Check for phoneme EOS per item - phoneme_eos_detected = needs_phoneme & ( - pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id - ).any( - dim=1 - ) # (B,) - - state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected - - # Track phoneme prediction end index for items that just ended - newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) - if newly_ended_phoneme.any(): - current_phoneme_step_idx = len(state.all_phoneme_predictions) # after append - state.phoneme_prediction_end_idx = torch.where( - newly_ended_phoneme, - torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), - state.phoneme_prediction_end_idx, - ) - - # Audio predictions for items in audio phase - if needs_audio.any(): - # Track audio prediction start index for items just entering audio phase - first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1) - if first_audio_step.any(): - # Track start in terms of frames (not steps) - current_frame_idx = sum(p.size(-1) for p in state.all_predictions) # total frames so far - state.audio_prediction_start_idx = torch.where( - first_audio_step, - torch.full_like(state.audio_prediction_start_idx, current_frame_idx), - state.audio_prediction_start_idx, - ) - - audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) - - # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor - S = self.frame_stacking_factor - C = self.num_audio_codebooks - audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S) # (B, C, S) - - # Update last_audio_codes with stacked format (needed for next step's embedding) - if state.last_audio_codes is None: - state.last_audio_codes = audio_codes_next_stacked - else: - update_mask = 
needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked) - state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) - - # Check for EOS in each frame and track exact end position - # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead - if state.gt_audio_embeddings is None: - # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) - all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) - - # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) - eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) - eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) - - # Find first frame with EOS per batch item (or S if none) - eos_frame_idx = torch.where( - eos_any_codebook.any(dim=1), - eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device), # no EOS in this step - ) # (B,) - - audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio - state.finished = state.finished | audio_eos_detected - - # Track audio prediction end index (in frames) for items that just ended - newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) - if newly_ended_audio.any(): - # End index = current frame count + frame offset where EOS was found - current_frame_count = len(state.all_predictions) * self.frame_stacking_factor - end_frame_idx = current_frame_count + eos_frame_idx - state.audio_prediction_end_idx = torch.where( - newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx - ) - - # Store unstacked codes - state.all_predictions.append(audio_codes_unstacked) - audio_codes_next = audio_codes_unstacked - - # Force-finish items when GT audio is exhausted (teacher forcing). - # This is checked AFTER predictions so the last valid prediction is still made. 
- # audio_steps was already incremented above. When audio_steps >= gt_audio_lens, - # we've consumed all GT input positions and made all corresponding predictions. - if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None: - gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens) - state.finished = state.finished | gt_exhausted - - return state, audio_codes_next, pred_phoneme_tokens - - def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: - """Predict phoneme tokens from the last hidden state.""" - actual_batch_size = state.batch_size - last_hidden = state.last_hidden - - # Get phoneme logits - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - phoneme_logits = all_code_logits_t_phoneme.view( - actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size - ) - max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) - - # Sample phonemes - if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0) - else: - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk - ) - - # In prediction mode, low-confidence phoneme steps are replaced with UNK across - # all stacked channels (except steps where EOS is predicted). 
- if ( - state.phoneme_input_type != 'gt' - and hasattr(self.phoneme_tokenizer, 'unk_token_id') - and self.phoneme_confidence_unk_threshold > 0.0 - ): - underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any( - dim=1, keepdim=True - ) # (B, 1) - eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) - replace_with_unk = underconfident_step & (~eos_predicted_step) - if replace_with_unk.any(): - unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id) - pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens) - # (B, phoneme_stacking_factor) - return pred_phoneme_tokens - - def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]: - """Predict audio codes from the last hidden state.""" - actual_batch_size = state.batch_size - last_hidden = state.last_hidden - - # Compute audio logits - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) - - # Apply CFG if enabled - if state.use_cfg: - conditional_logits = all_code_logits_t[:actual_batch_size] - unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits - - # Sample audio codes - audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( - last_hidden=last_hidden, - all_code_logits_t=all_code_logits_t, - temperature=state.temperature, - topk=state.topk, - use_local_transformer_for_inference=state.use_local_transformer, - use_cfg=state.use_cfg, - cfg_scale=state.cfg_scale, - ) - - return audio_codes_next, all_codes_next_argmax - - def streaming_finalize( - self, - state: StreamingState, - use_inference_mode: bool = True, - ) -> StreamingFinalizeOutput: - """ - Finalize streaming and return the complete generated audio and phoneme predictions. 
- - This function should be called after all streaming_step() calls are complete - (i.e., when state.finished.all() is True or max steps reached). - - Args: - state: Final StreamingState after streaming is complete. - - Returns: - StreamingFinalizeOutput containing audio, codes, and phoneme predictions. - """ - batch_size = state.batch_size - - # Extract and decode phoneme predictions - phoneme_tokens_list: List[List[int]] = [] - phoneme_text_list: List[str] = [] - if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: - # Stack phoneme predictions: each is (B, phoneme_stacking_factor) - all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) - for i in range(batch_size): - start = max(0, state.phoneme_prediction_start_idx[i].item()) - end = state.phoneme_prediction_end_idx[i].item() - if end < 0: - end = all_phonemes.size(-1) - # Flatten stacked phonemes back to sequence - tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist() - # Remove special tokens (BOS, EOS, PAD) - special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id} - if hasattr(self.phoneme_tokenizer, 'pad_token_id'): - special.add(self.phoneme_tokenizer.pad_token_id) - tokens = [t for t in tokens if t not in special] - phoneme_tokens_list.append(tokens) - phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens)) - else: - phoneme_tokens_list = [[] for _ in range(batch_size)] - phoneme_text_list = ["" for _ in range(batch_size)] - - if len(state.all_predictions) == 0: - return StreamingFinalizeOutput( - audio=torch.zeros(batch_size, 0, device=state.device), - audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device), - audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - grad_ctx = torch.inference_mode if 
use_inference_mode else torch.no_grad - with grad_ctx(): - # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) - all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) - total_frames = all_codes.size(-1) - num_codebooks = all_codes.size(1) - - # Start and end indices are in frames (not steps) - # If start_idx is -1, item never started audio predictions - use 0 - # If end_idx is -1, item never ended - use total_frames - start_indices = torch.clamp(state.audio_prediction_start_idx, min=0) - end_indices = torch.where( - state.audio_prediction_end_idx >= 0, - state.audio_prediction_end_idx, - torch.full_like(state.audio_prediction_end_idx, total_frames), - ) - - # Calculate per-item lengths (in frames) - predicted_codes_lens = end_indices - start_indices - max_len = predicted_codes_lens.max().item() - - # Handle case where all items have zero-length predictions - if max_len == 0: - return StreamingFinalizeOutput( - audio=torch.zeros(batch_size, 0, device=state.device), - audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype), - audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - # Create padded output tensor and slice each item's valid predictions - predicted_codes = torch.zeros( - batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device - ) - for i in range(batch_size): - start = start_indices[i].item() - end = end_indices[i].item() - length = end - start - if length > 0: - predicted_codes[i, :, :length] = all_codes[i, :, start:end] - - # No need to remove EOS - end_indices already point to the frame before EOS - # Decode to audio (codes are already unstacked: B, C, T) - audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) - - 
return StreamingFinalizeOutput( - audio=audio, - audio_len=audio_len, - audio_codes=predicted_codes, - audio_codes_len=predicted_codes_lens, - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - def infer_batch( - self, - batch: Dict[str, torch.Tensor], - max_decoder_steps: int = 500, - temperature: float = 0.7, - topk: int = 80, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_local_transformer_for_inference: bool = False, - phoneme_input_type: str = 'pred', - phoneme_sampling_method: str = 'argmax', - force_dropout_text: bool = False, - use_teacher_forced: bool = False, - use_inference_mode: bool = True, - ) -> InferBatchOutput: - """ - Batch inference using streaming infrastructure. - - This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize - that processes a batch dictionary similar to training_step/validation_step. - - Args: - batch: Dictionary containing: - - text: Text token IDs (B, L) - - text_lens: Lengths (B,) - - context_text_tokens: Context text tokens (B, L') - - context_text_tokens_lens: Lengths (B,) - - context_audio_codes: Context audio codes (B, C, T) OR - - context_audio / context_audio_lens: Raw context audio to encode - - phoneme_tokens (optional): GT phoneme tokens (B, L'') - - phoneme_tokens_lens (optional): Lengths (B,) - For teacher forcing (use_teacher_forced=True), also requires: - - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR - - audio / audio_lens: Raw audio waveforms to encode - max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature for audio codes. Use 0.0 for argmax. - topk: Top-k sampling parameter. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer_for_inference: Whether to use local transformer. - phoneme_input_type: 'gt' or 'pred' for phoneme tokens. - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. 
- force_dropout_text: Whether to dropout text embeddings. - use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) - instead of predicted codes at each streaming step. - - Returns: - InferBatchOutput containing predicted audio, codes, and RTF metrics. - """ - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - start_time = time.time() - - # Extract tensors from batch - text = batch['text'] - text_lens = batch['text_lens'] - context_text_tokens = batch['context_text_tokens'] - context_text_tokens_lens = batch['context_text_tokens_lens'] - - # Handle context audio - either use codes directly or encode from audio - if 'context_audio_codes' in batch: - context_audio_codes = batch['context_audio_codes'] - context_audio_codes_lens = batch['context_audio_codes_lens'] - else: - context_audio = batch['context_audio'] - context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - - # Optional GT phoneme tokens for teacher forcing - gt_phoneme_tokens = batch.get('phoneme_tokens') - gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') - - # Prepare GT audio codes for teacher forcing if requested - gt_audio_codes_for_init = None - gt_audio_codes_lens_for_init = None - if use_teacher_forced: - # Force GT phoneme input and argmax sampling - phoneme_input_type = 'gt' - temperature = 0.0 - - # Get GT audio codes - support both codes and raw audio - if 'audio_codes' in batch: - gt_audio_codes_raw = batch['audio_codes'] - gt_audio_codes_lens_raw = batch['audio_codes_lens'] - elif 'audio' in batch: - gt_audio_codes_raw, gt_audio_codes_lens_raw = self.audio_to_codes( - batch['audio'], batch['audio_lens'] - ) - else: - raise ValueError( - "Teacher forcing requires 'audio_codes'/'audio_codes_lens' or 'audio'/'audio_lens' in batch." 
- ) - - # Pre-process GT audio codes same as prepare_audio_channel_embeddings: - # codec convert, add BOS/EOS, stack, then take input portion ([:, :, :-1]) - if self._codec_converter is not None: - gt_audio_codes_raw = self._codec_converter.convert_original_to_new( - audio_tokens=gt_audio_codes_raw, audio_lens=gt_audio_codes_lens_raw - ).long() - - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( - codes=gt_audio_codes_raw, - codes_len=gt_audio_codes_lens_raw, - bos_id=self.audio_bos_id, - eos_id=self.audio_eos_id, - ) - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( - gt_audio_codes_processed, - gt_audio_codes_lens_processed, - self.audio_bos_id, - self.audio_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, - ) - - # Input portion: all tokens except the last (teacher forcing shift) - gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] - gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 - - batch_size = text.size(0) - - # Initialize streaming state - state = self.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer_for_inference, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - gt_audio_codes=gt_audio_codes_for_init, - gt_audio_codes_lens=gt_audio_codes_lens_for_init, - use_inference_mode=use_inference_mode, - ) - - time_to_first_prediction = None - generation_start_time = time.time() - device = text.device - - # Generate until all items are finished or max steps reached - print("Generation started") - gen_step = 0 - while not state.finished.all() and 
len(state.all_predictions) < max_decoder_steps: - gen_step += 1 - if gen_step % 10 == 0: - print(f"Generation step {gen_step} ") - # Gather the correct text token for each batch item based on text_tokens_seen - # Items in context phase will have their token ignored by streaming_step - positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) - current_tokens = text[torch.arange(batch_size, device=device), positions] - - # For items that have exhausted their text, provide EOS token - text_exhausted = state.text_tokens_seen >= text_lens - current_tokens = torch.where( - text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens - ) - - state, audio_codes, phoneme_tokens = self.streaming_step( - state=state, - text_tokens=current_tokens, - force_dropout_text=force_dropout_text, - use_inference_mode=use_inference_mode, - ) - - # Record time to first audio prediction - if time_to_first_prediction is None and audio_codes is not None: - time_to_first_prediction = time.time() - start_time - - tts_generation_time = time.time() - generation_start_time - - # Finalize and decode audio - finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) - - end_time = time.time() - total_time = end_time - start_time - - # Compute RTF metrics - total_audio_samples = finalize_output.audio_len.sum().item() - total_audio_duration = total_audio_samples / self.output_sample_rate - num_frames = len(state.all_predictions) - tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 - - rtf_metrics = { - 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, - 'time_to_first_prediction': time_to_first_prediction, - 'tts_generation_time': tts_generation_time, - 'max_frames_generated': num_frames, - 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': batch_size, - } - - # Extract raw phoneme predictions from state - ib_phoneme_tokens = None - ib_phoneme_tokens_lens = None 
- if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: - # Stack: each element is (B, phoneme_stacking_factor), stack along time -> (B, S, T) - ib_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) - # Compute per-item lengths using start/end indices - ib_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - for i in range(batch_size): - start = max(0, state.phoneme_prediction_start_idx[i].item()) - end = state.phoneme_prediction_end_idx[i].item() - if end < 0: - end = ib_phoneme_tokens.size(-1) - ib_phoneme_tokens_lens[i] = end - start - - return InferBatchOutput( - predicted_audio=finalize_output.audio, - predicted_audio_lens=finalize_output.audio_len, - predicted_codes=finalize_output.audio_codes, - predicted_codes_lens=finalize_output.audio_codes_len, - rtf_metrics=rtf_metrics, - predicted_phoneme_tokens=ib_phoneme_tokens, - predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, - phoneme_prediction_start_idx=( - state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None - ), - ) - - @staticmethod - def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: - """ - Load context audio and resample if needed. - Returns tensor of shape (1, num_samples). - """ - audio, sr = sf.read(audio_path, dtype='float32') - if len(audio.shape) > 1: - audio = audio.mean(axis=1) - if sr != target_sample_rate: - import librosa - - audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) - return torch.from_numpy(audio).unsqueeze(0) - - @staticmethod - def _adjust_audio_to_duration_for_inference( - audio: torch.Tensor, - sample_rate: int, - target_duration: float, - codec_model_samples_per_frame: int, - ) -> torch.Tensor: - """ - Match the same duration-alignment logic used in magpietts_streaming_inference.py. 
- """ - num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) - target_num_samples = num_codec_frames * codec_model_samples_per_frame - current_num_samples = audio.size(1) - - if current_num_samples >= target_num_samples: - audio = audio[:, :target_num_samples] - else: - num_repeats = int(np.ceil(target_num_samples / current_num_samples)) - audio_repeated = audio.repeat(1, num_repeats) - audio = audio_repeated[:, :target_num_samples] - return audio - - def do_tts( - self, - transcript: str, - context_audio_file_path: Optional[str] = None, - context_text: str = "[NO TEXT CONTEXT]", - main_tokenizer_name: Optional[str] = None, - context_audio_duration: float = 5.0, - use_cfg: bool = True, - cfg_scale: float = 2.5, - use_local_transformer: bool = True, - temperature: float = 0.7, - topk: int = 80, - max_steps: int = 330, - gt_phoneme_text: Optional[str] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Generate speech from transcript using EasyMagpie inference with optional context text/audio. - Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. - """ - if transcript is None or transcript.strip() == "": - raise ValueError("`transcript` must be a non-empty string.") - - device = next(self.parameters()).device - transcript = transcript.strip() - context_text = (context_text or "[NO TEXT CONTEXT]").strip() - - if main_tokenizer_name is None: - # Match model init behavior: default to first configured tokenizer. - main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] - if main_tokenizer_name not in self.tokenizer.tokenizers: - raise ValueError( - f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" - f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" - ) - - text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] - text = torch.tensor([text_tokens], dtype=torch.long, device=device) - text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) - - context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) - context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) - context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) - - if context_audio_file_path is not None and context_audio_file_path.strip() != "": - context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) - context_audio = self._adjust_audio_to_duration_for_inference( - context_audio, - self.sample_rate, - context_audio_duration, - self.codec_model_samples_per_frame, - ) - context_audio = context_audio.to(device) - context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) - with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - else: - context_audio_codes = torch.zeros( - 1, - self.data_num_audio_codebooks, - 0, - dtype=torch.long, - device=device, - ) - context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) - - batch = { - 'text': text, - 'text_lens': text_lens, - 'context_text_tokens': context_text_tensor, - 'context_text_tokens_lens': context_text_lens, - 'context_audio_codes': context_audio_codes, - 'context_audio_codes_lens': context_audio_codes_lens, - } - phoneme_input_type = 'pred' - if gt_phoneme_text is not None: - if self.phoneme_tokenizer is None: - raise ValueError( - "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
- ) - gt_phoneme_text = gt_phoneme_text.strip() - if gt_phoneme_text == "": - raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") - gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) - gt_phoneme_tokens = ( - [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] - ) - if len(gt_phoneme_tokens) == 0: - raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") - batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) - batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) - phoneme_input_type = 'gt' - - with torch.inference_mode(): - output = self.infer_batch( - batch=batch, - max_decoder_steps=max_steps, - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer_for_inference=use_local_transformer, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method='argmax', - use_teacher_forced=False, - use_inference_mode=True, - ) - return output.predicted_audio, output.predicted_audio_lens - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - return [] diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py new file mode 100644 index 000000000000..5bab45559174 --- /dev/null +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -0,0 +1,2018 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time +from dataclasses import dataclass +from functools import partial +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import numpy as np +import soundfile as sf +import torch +from hydra.utils import instantiate +from lightning.pytorch import Trainer +from omegaconf import DictConfig +from torch import nn +from transformers import AutoConfig, AutoModelForCausalLM + +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + instantiate_phoneme_tokenizer, + setup_tokenizers, +) +from nemo.collections.tts.models import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel +from nemo.collections.tts.modules import transformer_2501 +from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter +from nemo.collections.tts.modules.magpietts_modules import ( + CharAwareSubwordEncoder, + LocalTransformerType, + SpecialAudioToken, +) +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes.common import PretrainedModelInfo +from nemo.utils import logging + + +@dataclass +class TrainingMode: + """ + Configuration for a training mode in multi-mode training. 
+ + Attributes: + text_input_mode: Either "full" or "streaming" + streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) + streaming_speech_delay: Delay for speech stream (only used in streaming mode) + mode_idx: Index of this mode in the list of modes (used for task embedding lookup) + """ + + text_input_mode: str + streaming_phonemes_delay: int + streaming_speech_delay: int + mode_idx: int + + @property + def name(self) -> str: + """Derived identifier used for inference selection and logging.""" + return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" + + +@dataclass +class StreamingState: + """ + State for streaming TTS inference with batch support. + + This dataclass maintains all the necessary state for autoregressive streaming + generation, allowing text tokens to be fed incrementally. Supports arbitrary + batch sizes where each batch item can have different context lengths and be + in different phases. + + The streaming operates in four phases (per batch item): + 1. Context phase (context_position < full_context_lens): Processing remaining context + 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only + 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions + + Attributes: + batch_size: Number of items in the batch. + past_key_values: KV cache from the transformer for efficient autoregressive decoding. + cache_seq_len: Current sequence length in the cache. + all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. + all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). + context_audio_codes: Processed context audio codes with special tokens. + context_audio_codes_lens: Length of context audio codes. 
+ context_lens: Total context length (task_embedding + context_audio + context_text). + full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). + full_context_lens: Full context length for each batch item (B,). + context_position: How much context has been processed per batch item (B,). + text_tokens_seen: Number of text tokens processed so far per batch item (B,). + phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). + audio_steps: Number of audio prediction steps taken per batch item (B,). + phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. + finished: Whether generation is complete per batch item (B,) bool tensor. + device: Device tensors are on. + training_mode: The training mode being used for inference. + use_cfg: Whether classifier-free guidance is enabled. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer for inference. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). + last_hidden: Last hidden state from transformer. + text_finished: Whether text input has finished per batch item (B,) bool tensor. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). + last_audio_codes: Last predicted audio codes (B, num_codebooks). + audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). + audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. + phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). 
+ phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. + """ + + batch_size: int + past_key_values: Optional[Tuple] + cache_seq_len: int + all_predictions: List[torch.Tensor] + all_phoneme_predictions: List[torch.Tensor] + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + context_lens: torch.Tensor + full_context_embedding: torch.Tensor + full_context_lens: torch.Tensor + context_position: torch.Tensor + text_tokens_seen: torch.Tensor + phoneme_steps: torch.Tensor + audio_steps: torch.Tensor + phoneme_stream_ended: torch.Tensor + phoneme_eos_detected: torch.Tensor + finished: torch.Tensor + device: torch.device + training_mode: TrainingMode + use_cfg: bool + cfg_scale: float + use_local_transformer: bool + temperature: float + topk: int + dummy_context_embedding_unconditional: Optional[torch.Tensor] + last_hidden: torch.Tensor + text_finished: torch.Tensor + phoneme_input_type: str + phoneme_sampling_method: str + last_phoneme_tokens: Optional[torch.Tensor] + last_audio_codes: Optional[torch.Tensor] + audio_prediction_start_idx: torch.Tensor + audio_prediction_end_idx: torch.Tensor + phoneme_prediction_start_idx: torch.Tensor + phoneme_prediction_end_idx: torch.Tensor + gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings + gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings + gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + + +@dataclass +class StreamingFinalizeOutput: + """Output from streaming_finalize containing audio and phoneme predictions.""" + + audio: torch.Tensor # (B, max_audio_len) generated audio waveform + audio_len: torch.Tensor # (B,) length of audio per batch item + audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes + audio_codes_len: torch.Tensor # 
(B,) length of codes per batch item + phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item + phoneme_text: List[str] # Decoded phoneme strings per batch item + + +@dataclass +class InferBatchOutput: + """Output dataclass for EasyMagpieTTS infer_batch method.""" + + predicted_audio: torch.Tensor # (B, T_audio) + predicted_audio_lens: torch.Tensor # (B,) + predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) + predicted_codes_lens: torch.Tensor # (B,) + rtf_metrics: Dict[str, Any] + predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) + predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item + phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens + + +class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): + """ + Inference-only base class for EasyMagpieTTS decoder-only model. + + Contains the model architecture (codec, embeddings, decoder, local transformer), + shared building-block methods, and all inference methods (streaming_init, + streaming_step, streaming_finalize, infer_batch, do_tts). + + EasyMagpieTTSModel subclasses this to add training, validation, and data loading. 
+ """ + + def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): + self.world_size = 1 + if trainer is not None: + self.world_size = trainer.num_nodes * trainer.num_devices + + # load codec + codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) + self.sample_rate = codec_model.sample_rate + self.output_sample_rate = codec_model.output_sample_rate + + if hasattr(codec_model, "discriminator"): + # del codec discriminator to free memory + del codec_model.discriminator + + # Set up codebook configuration + vector_quantizer = cfg.get('vector_quantizer') + if vector_quantizer is not None: + vector_quantizer = instantiate(vector_quantizer) + num_audio_codebooks = vector_quantizer.num_codebooks + codebook_size = vector_quantizer.codebook_size + codec_converter = VectorQuantizerIndexConverter( + vector_quantizer_original=codec_model.vector_quantizer, + vector_quantizer_new=vector_quantizer, + ) + data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks + else: + num_audio_codebooks = codec_model.num_codebooks + data_num_audio_codebooks = num_audio_codebooks + codebook_size = codec_model.codebook_size + codec_converter = None + + # The dataloader needs to know the number of codebooks that the context codes were stored in + # In the case where there are no context codes saved, and there is no context audio (in the text context path), + # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for + # data_num_audio_codebooks + self.data_num_audio_codebooks = data_num_audio_codebooks + self.num_audio_codebooks = num_audio_codebooks + self.codebook_size = codebook_size + + self.codec_model_samples_per_frame = codec_model.samples_per_frame + # Our codebooks start with actual audio codec tokens, followed by special tokens. + # The `forced_*` options are for backward compatibility for models trained with older code. 
+ get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) + self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS) + self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS) + self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS) + self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS) + self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN) + self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken) + self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False) + + # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer. + self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None) + if self.text_conditioning_tokenizer_name is None: + self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] + + self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) + + # Multi-mode training configuration + # The model trains with multiple text input modes (full, streaming with various delays) + # Each mode has its own task embedding that is prepended to the context + training_modes_cfg = cfg.get('training_modes', None) + if training_modes_cfg is None: + # Create a default training mode for backward compatibility + self.training_modes = [ + TrainingMode( + text_input_mode="streaming", + streaming_phonemes_delay=4, + streaming_speech_delay=8, + mode_idx=0, + ) + ] + + else: + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) + + logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") + for mode in self.training_modes: 
+ logging.info( + f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}" + ) + + # Create a mapping from mode name to mode object for easy lookup during inference + self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} + # Default mode for inference if not specified (first mode in the list) + self.default_inference_mode = self.training_modes[0].name + + self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1) + + self.tokenizer = setup_tokenizers( + all_tokenizers_config=cfg.text_tokenizers, + mode='train', + ) + + num_tokens_tokenizer = len(self.tokenizer.tokens) + num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK + self.bos_id = num_tokens - 3 + self.eos_id = num_tokens - 2 + self.cfg_unk_token_id = num_tokens - 1 + self.phoneme_tokenizer = None + if cfg.get('phoneme_tokenizer', None) is not None: + self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) + self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + if cfg.get('phoneme_corruption_batch_prob', None) is None: + # Legacy mode: remove the UNK token from the phoneme vocabulary + # TODO: Remove this. + self.phoneme_vocab_size -= 1 + # If max phoneme probability is below this threshold at inference-time, + # replace the predicted timestep with UNK to reduce error propagation. 
+ self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) + + self.pad_context_text_to_max_duration = False + self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) + + super().__init__(cfg=cfg, trainer=trainer) + + # This needs to happen after super().__init__() + self._codec_model = codec_model + self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() + self._codec_converter = codec_converter + + # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters + self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) + + audio_embeddings = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) + self.audio_embeddings = nn.ModuleList(audio_embeddings) + + # Projection from audio_embedding_dim to embedding_dim (Identity if same) + if self.audio_embedding_dim != cfg.embedding_dim: + self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) + else: + self.audio_in_projection = nn.Identity() + + if self.phoneme_tokenizer is not None: + phoneme_embeddings = [] + for _ in range(self.phoneme_stacking_factor): + phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) + self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) + + # Decoder backend selection - supports HuggingFace models or NemotronH + self.decoder_type = cfg.get('decoder_type', 'huggingface') # backward compatible default + logging.info(f"Using decoder type: {self.decoder_type}") + + if self.decoder_type == 'huggingface': + # Existing HuggingFace path + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + hf_transformer = 
AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + elif self.decoder_type == 'nemotron_h': + # NemotronH hybrid Mamba2/Attention backend + from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM + + # Build config from YAML parameters + nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) + # Ensure hidden_size matches embedding_dim for compatibility + if 'hidden_size' not in nemotron_h_config_dict: + nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim + nemotron_config = NemotronHConfig(**nemotron_h_config_dict) + nemotron_model = NemotronHForCausalLM(nemotron_config) + self.decoder = nemotron_model.backbone + self.lm_text_head = nemotron_model.lm_head + logging.info( + f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." + ) + + else: + raise ValueError(f"Unknown decoder_type: {self.decoder_type}. 
Supported: 'huggingface', 'nemotron_h'") + + self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) + self.decoder.set_input_embeddings(self.text_embedding) + + # Task embedding for multi-mode training + # Each mode has a unique task embedding that is prepended to the context + # Only create task embedding if there are multiple modes + num_modes = len(self.training_modes) + if num_modes > 1: + self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) + logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") + else: + self.task_embedding = None + logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") + + if self.use_bpe_char_tokenizer: + # BPE char tokenizer + assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" + tokenizer_name = self.tokenizer.tokenizer_names[0] + tokenizer = self.tokenizer.tokenizers[tokenizer_name] + subword_vocab = tokenizer.get_vocab() + # special tokens will be stored as it is in the char_vocab + # Each special token will only be mapped to one char id + special_vocab = { + '': self.bos_id, + '': self.eos_id, + '': self.cfg_unk_token_id, + } + self.cas_encoder = CharAwareSubwordEncoder( + d_embed=cfg.embedding_dim, + llm_tokenizer_vocab=subword_vocab, + subword_padding_idx=self.tokenizer.pad, + special_vocab=special_vocab, + ) + + # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) + if self.audio_embedding_dim != cfg.hidden_dim: + self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) + else: + self.audio_out_projection = nn.Identity() + + self.final_proj = nn.Linear( + self.audio_embedding_dim, + self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, + ) + + self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) + logging.info(f"Local transformer type: 
{self.local_transformer_type}") + if self.local_transformer_type != LocalTransformerType.NO_LT: + local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) + if local_transformer_hidden_dim != cfg.hidden_dim: + self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) + else: + self.local_transformer_in_projection = nn.Identity() + self.local_transformer = transformer_2501.Transformer( + n_layers=self.cfg.get('local_transformer_n_layers', 2), + d_model=local_transformer_hidden_dim, + d_ffn=local_transformer_hidden_dim * 4, + sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), + kernel_size=1, + is_causal=self.local_transformer_type == LocalTransformerType.AR, + max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, + use_learnable_pos_emb=True, + ) + # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) + if self.audio_embedding_dim != local_transformer_hidden_dim: + self.local_transformer_audio_out_projection = nn.Linear( + local_transformer_hidden_dim, self.audio_embedding_dim + ) + else: + self.local_transformer_audio_out_projection = nn.Identity() + local_transformer_out_projections = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + # Have a separate projection layer for each codebook, to distinguish between them + local_transformer_out_projections.append( + nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) + ) + self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + + def _get_state_dict_keys_to_exclude(self): + return [ + '_codec_model', + ] + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + self._codec_model.eval() + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) + + with 
torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): + if self._codec_converter is not None: + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + if codes_len.min() < 4: + codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) + codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) + codes = codes[:, :, : codes_len.max()] + + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) + return audio, audio_len, codes + + def embed_audio_tokens(self, audio_tokens): + # audio_tokens: (B, C, T') + # Add and average the embeddings of the audio tokens across the codebooks + audio_embedding = None + for c in range(audio_tokens.size(1)): + embedding = self.audio_embeddings[c](audio_tokens[:, c, :]) + if audio_embedding is None: + audio_embedding = embedding + else: + audio_embedding = audio_embedding + embedding + audio_embedding = audio_embedding / audio_tokens.size(1) + # Project from audio_embedding_dim to embedding_dim + audio_embedding = self.audio_in_projection(audio_embedding) + return audio_embedding + + def embed_phoneme_tokens(self, phoneme_tokens): + # phoneme_tokens: (B, S, T') + phoneme_embedding = None + for c in range(phoneme_tokens.size(1)): + embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :]) + if phoneme_embedding is None: + phoneme_embedding = embedding + else: + phoneme_embedding = phoneme_embedding + embedding + phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) + return phoneme_embedding + + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): + # Only pass cache_position for NemotronH (HF transformers may not accept it) + if self.decoder_type == 'nemotron_h': + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + 
cache_position=cache_position, + ) + else: + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + return backend_out + + def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): + # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) + # audio_codes_lens: (B,) + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits[:, :, si:ei] + codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) + # argmax to get the tokens + codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') + all_preds.append(codebook_preds) + + all_preds = torch.stack(all_preds, dim=1) # (B, C, T') + audio_mask = get_mask_from_lengths(audio_codes_lens) + all_preds = all_preds * audio_mask.unsqueeze(1) + + return all_preds + + def sample_codes_from_logits( + self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} + ): + # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + 
codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): + # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep + all_preds = [] + for idx in range(self.phoneme_stacking_factor): + si = idx * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # 
(B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def join_embeddings_temporally( + self, + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Merges Multiple Embedding sequences into a single Embedding Sequence. + + Args: + embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding + lengths : Sequence of tensors, each of shape (B,) + pad_embed : (E,) — embedding to use for padding, defaults to zeros + + Returns: + joined : (B, max_sum_len, E) — merged & padded + out_lengths : (B,) — total lengths of each batch element after merging + """ + if len(embeddings) == 0: + raise ValueError("contexts must be non-empty") + + B, _, E = embeddings[0].shape + device = embeddings[0].device + dtype = embeddings[0].dtype + + # 1. 
compute output sizes + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + out_lengths = len_stack.sum(0) + max_len = int(out_lengths.max()) + + if pad_embed is None: + pad_embed = torch.zeros(E, dtype=dtype, device=device) + + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + + # batch row indices + batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) + + # running offset keeps "write cursor" for each row + offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) + + for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): + Ti = embedding_i.shape[1] + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + + # destination columns: offset + t + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + + # Assign embedding_i to the correct positions in joined + # Ensure dtype matches to avoid errors during mixed-precision training + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) + + # move cursor past this segment + offset += len_i + + return joined, out_lengths + + def prepare_context_tensors( + self, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: Optional[torch.Tensor] = None, + training_mode: Optional[TrainingMode] = None, + dropout_conditional_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare context tensors (without text) for the simplified process_batch. + + This function processes context audio and context text to create the combined + context embedding. 
+ Args: + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) + context_text_tokens_lens: Length of context text for each batch item (B,) + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + If None, will be computed from context_audio. + context_audio_codes_lens: Length of context audio codes (B,). + Required if context_audio_codes is provided. + context_audio: Raw context audio waveform (B, T). + Used to compute context_audio_codes if not provided. + context_audio_lens: Length of context audio (B,). + Required if context_audio is provided. + training_mode: Optional TrainingMode object specifying the mode to use. + If None, uses the first mode from training_modes as default. + dropout_conditional_input: If True, replace context with CFG unconditional token. + + Returns: + Tuple of: + - context_embedding: Combined context embedding (B, T_context, E) + - context_lens: Total context length per batch item (B,) + - context_audio_codes: Processed audio codes with special tokens (B, C, T') + - context_audio_codes_lens: Length of processed context audio codes (B,) + """ + # Determine the mode parameters to use + if training_mode is None: + training_mode = self.training_modes[0] + + current_mode_idx = training_mode.mode_idx + batch_size = context_text_tokens.size(0) + device = context_text_tokens.device + + # Context Audio + if context_audio_codes is None: + if context_audio is None: + raise ValueError("Either context_audio_codes or context_audio must be provided") + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() + + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + 
bos_id=self.context_audio_bos_id, + eos_id=self.context_audio_eos_id, + ) + + # Use legacy audio_bos_id/audio_eos_id if flag is set + stack_bos_id = ( + self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id + ) + stack_eos_id = ( + self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + ) + + context_audio_codes, context_audio_codes_lens = self.stack_codes( + context_audio_codes, + context_audio_codes_lens, + stack_bos_id, + stack_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) + + # Context Text + context_text_lens = context_text_tokens_lens + context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + + # Prepare task embedding for multi-mode training + task_embedding = None + task_embedding_lens = None + if self.task_embedding is not None and current_mode_idx is not None: + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) + task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) + + # Combine context embeddings: [task_embedding | context_audio | context_text] + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens], + ) + + # Handle CFG unconditional dropout + if dropout_conditional_input: + cfg_token_id = self.cfg_unk_token_id + cfg_token_embedding = self.decoder.get_input_embeddings()( 
+ torch.full((batch_size, 1), cfg_token_id, device=device) + ) # (B, 1, E) + # Expand CFG token to match context embedding size + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) + + return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens + + def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + """ + Stack multiple time steps into the channel dimension to reduce sequence length. + + This function reshapes audio/phoneme codes by grouping consecutive time steps together + and placing them in the channel dimension. This allows the model to process multiple + frames in parallel while reducing the sequence length. + + Args: + codes: Input codes tensor of shape (B, C, T) where B is batch size, + C is number of codebooks, and T is sequence length. + codes_lens: Length of valid codes for each batch item, shape (B,). + bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. + eos_id: End-of-sequence token ID used for padding. + stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. + num_codebooks: Number of codebooks in the input. + + Returns: + Tuple of: + - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). + If input contains BOS tokens, they are preserved at the beginning. + - new_lens: Updated sequence lengths after stacking, shape (B,). 
+ """ + if stacking_factor == 1: + return codes, codes_lens + + contains_bos = codes[0, 0, 0].item() == bos_id + if contains_bos: + bos_tensor_repeated = torch.full( + (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device + ) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token + B, C, T = codes.shape + s = int(stacking_factor) + + # --- Compute max padding needed --- + pad_t = (-T) % s # pad so that T' is divisible by s + pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) + codes = torch.cat([codes, pad_tail], dim=-1) + + # --- Stack time into channel dimension --- + Tp = codes.shape[-1] + T_out = Tp // s + codes = codes.view(B, C, T_out, s) + codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) + + new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') + if contains_bos: + codes = torch.cat([bos_tensor_repeated, codes], dim=2) + new_lens = new_lens + 1 + + return codes, new_lens + + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + """ + Reverse the stacking operation to recover the original time dimension. + + This is the inverse of `stack_codes`. It takes codes that have been stacked + in the channel dimension and expands them back into the time dimension. + + Args: + stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) + where T_stacked = T_original // stacking_factor. + stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). + stacking_factor: The stacking factor used in the original `stack_codes` call. + If 1, no unstacking is performed. + + Returns: + Tuple of: + - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). + - orig_lens: Recovered sequence lengths, shape (B,). 
Note that these are the + maximum possible lengths; actual valid lengths may be shorter due to + padding applied during stacking. + """ + if stacking_factor == 1: + return stacked_codes, stacked_lens + + B, CxS, T_out = stacked_codes.shape + s = int(stacking_factor) + assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" + + C = CxS // s + # Reshape: split channels back into (C, s) + x = stacked_codes.view(B, C, s, T_out) + # Bring s back into time dimension + x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) + + # Recover original lengths (before padding) + orig_lens = stacked_lens * s + + return x, orig_lens + + def _sample_audio_codes( + self, + last_hidden: torch.Tensor, + all_code_logits_t: torch.Tensor, + temperature: float, + topk: int, + use_local_transformer_for_inference: bool, + use_cfg: bool, + cfg_scale: float, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Sample audio codes from logits using either local transformer or parallel sampling. + + Returns: + audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) + all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) + """ + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR: + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + # Base class returns (B, C, S); flatten to (B, C*S) for downstream code + audio_codes_next = audio_codes_next.permute(0, 2, 1) + audio_codes_next = audio_codes_next.reshape(audio_codes_next.size(0), -1) + else: + raise ValueError( + f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" + ) + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from all codebook logits + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) + # Argmax sampling for reliable EOS detection + if temperature <= 0.0: + all_codes_next_argmax = audio_codes_next # already argmax + else: + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + + return audio_codes_next, all_codes_next_argmax + + def streaming_init( + self, + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + phoneme_input_type: str = 'predicted', + phoneme_sampling_method: str = 'argmax', + gt_phoneme_tokens: Optional[torch.Tensor] = None, + gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + gt_audio_codes: Optional[torch.Tensor] = None, + gt_audio_codes_lens: Optional[torch.Tensor] = None, + use_inference_mode: bool = True, + ) -> StreamingState: + """ + Initialize streaming TTS inference state. + + This prepares the model for streaming inference by processing the context + (audio + context text) and returning a StreamingState that can be used + with streaming_step() to incrementally generate audio. + + Note: This function does NOT take the main text input. Text tokens are + provided incrementally via streaming_step(). + + For batched inference, each batch item can have a different context length. + This function processes only up to the minimum context length across the batch, + storing the remaining context to be processed in streaming_step's context phase. + + The streaming inference follows phases (per batch item): + 1. Context phase: Processing remaining context (if any) for items with longer context. + 2. 
Prompt phase: First `streaming_speech_delay` text tokens are processed + without generating audio (building up context). + 3. Generation phase: Audio BOS is added and audio codes are generated + autoregressively, with remaining text tokens added to audio embeddings. + + Args: + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + context_audio_codes_lens: Length of context audio codes (B,). + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). + context_text_tokens_lens: Length of context text (B,). + inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). + If None, uses the default inference mode. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor (higher = stronger conditioning). + use_local_transformer: Whether to use local transformer for AR sampling. + temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. + gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). + gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, + input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. + gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. + + Returns: + StreamingState: Initial state for streaming inference. 
+ """ + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + batch_size = context_audio_codes.size(0) + device = context_audio_codes.device + + # Resolve inference mode + mode_name = inference_mode if inference_mode is not None else self.default_inference_mode + if mode_name not in self.mode_name_to_mode: + available_modes = list(self.mode_name_to_mode.keys()) + raise ValueError(f"Unknown inference mode '{mode_name}'. Available modes: {available_modes}") + + selected_training_mode = self.mode_name_to_mode[mode_name] + + # Prepare context embedding using shared helper + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=False, + ) + ) + + # Store full context embedding and lens before any CFG manipulation + full_context_embedding = context_embedding.clone() # (B, T_max, E) + full_context_lens = context_lens.clone() # (B,) + + # Compute min context length - we only process up to this in init + min_context_len = context_lens.min().item() + + # Setup classifier-free guidance if enabled + dummy_context_embedding_unconditional = None + if use_cfg: + dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( + torch.full((1, 1), self.cfg_unk_token_id, device=device) + ) + # Create unconditional context (same length as conditional) + dummy_context_expanded = dummy_context_embedding_unconditional.expand( + batch_size, context_embedding.size(1), -1 + ) + # Concatenate conditional and unconditional: (2*B, T, E) + context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) + + # First forward pass to process context - only up to min_context_len + cache_position = 
torch.arange(min_context_len, device=device) + transformer_out = self.forward( + inputs_embeds=context_embedding[:, :min_context_len, :], + attention_mask=None, + use_cache=True, + past_key_values=None, + cache_position=cache_position, + ) + + last_hidden = transformer_out.last_hidden_state + past_kv = transformer_out.past_key_values + current_cache_seq_len = min_context_len + + # Process GT phoneme tokens if provided (for teacher forcing) + gt_phoneme_embeddings = None + gt_phoneme_lens = None + if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: + gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) + gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( + gt_phoneme_expanded, + gt_phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, + ) + gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + + # Process GT audio codes if provided (for teacher forcing) + gt_audio_embeddings = None + gt_audio_lens_state = None + if gt_audio_codes is not None and gt_audio_codes_lens is not None: + gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) + gt_audio_lens_state = gt_audio_codes_lens + + # Initialize streaming state with batch support + state = StreamingState( + batch_size=batch_size, + past_key_values=past_kv, + cache_seq_len=current_cache_seq_len, + all_predictions=[], + all_phoneme_predictions=[], + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_lens=context_lens, + full_context_embedding=full_context_embedding, + full_context_lens=full_context_lens, + context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), + text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + audio_steps=torch.zeros(batch_size, 
dtype=torch.long, device=device), + phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), + finished=torch.zeros(batch_size, dtype=torch.bool, device=device), + device=device, + training_mode=selected_training_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + last_hidden=last_hidden, + text_finished=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + last_phoneme_tokens=None, + last_audio_codes=None, + audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + gt_phoneme_embeddings=gt_phoneme_embeddings, + gt_phoneme_lens=gt_phoneme_lens, + gt_audio_embeddings=gt_audio_embeddings, + gt_audio_lens=gt_audio_lens_state, + ) + + return state + + def streaming_step( + self, + state: StreamingState, + text_tokens: Optional[torch.Tensor] = None, + force_dropout_text: bool = False, + use_inference_mode: bool = True, + ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform one streaming inference step with batch support. + + This function processes one text token per batch item (or signals end of text with None) + and generates predictions according to the streaming delays. Each batch item can be + in a different phase. + + The streaming operates in four phases per batch item: + 1. 
Context phase (context_position < full_context_lens): + - Still processing remaining context from streaming_init + - Uses context embedding, ignores text_tokens for this item + 2. Prompt phase (text_tokens_seen < phoneme_delay): + - Only text tokens are processed, KV cache is extended + - No phoneme or audio predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): + - Starts with phoneme BOS on first step + - Only phoneme predictions (no audio) + - Input: text embedding + phoneme embedding + 4. Audio phase (text_tokens_seen >= speech_delay): + - Starts with audio BOS on first step + - Both phoneme and audio predictions + - Input: text embedding + phoneme embedding + audio embedding + + IMPORTANT: Only ONE forward call to the decoder per streaming_step. + + Args: + state: Current StreamingState from streaming_init or previous streaming_step. + text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. + For items still in context phase, the text_token value is ignored (can be 0). + When None is passed, the model continues generating until EOS. 
+ + Returns: + Tuple of: + - Updated StreamingState + - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase + where C = num_audio_codebooks and S = frame_stacking_factor + - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase + """ + if state.finished.all(): + return state, None, None + + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + device = state.device + batch_size = state.batch_size + streaming_speech_delay = state.training_mode.streaming_speech_delay + streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay + + # ==================== DETERMINE PHASES PER BATCH ITEM ==================== + needs_context = state.context_position < state.full_context_lens # (B,) bool + needs_text = (~needs_context) & (~state.text_finished) + needs_phoneme = ( + (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + ) + needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + + next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + # --- Context phase items: use next context embedding --- + if needs_context.any(): + # Gather context embeddings at current position for each item + # context_position: (B,) - position indices + # full_context_embedding: (B, T_max, E) + ctx_positions = state.context_position.clone() # (B,) + # Clamp positions to valid range for gathering + ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) + # Gather: need (B, 1, E) from (B, T, E) at positions (B,) + ctx_emb = state.full_context_embedding[ + torch.arange(batch_size, device=device), ctx_positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) + # Only apply to items in context phase + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input = next_input + ctx_emb * context_mask + + # --- 
Non-context phase items: handle text embedding --- + text_embedded = None + if text_tokens is not None and needs_text.any(): + # Embed text tokens for all items (will be masked later) + text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) + text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) + + # Handle BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) + cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) + text_embedded = text_embedded + cas_embedding + + if force_dropout_text: + text_embedded = text_embedded * 0 + + # Check for EOS tokens - mark those items as text_finished + # The EOS token itself IS embedded normally (matching process_batch behavior + # where EOS is part of the text sequence). After this step, text_finished is set + # so subsequent steps won't add any text embedding. + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask + state.text_finished = state.text_finished | is_eos_token + + elif text_tokens is None: + # Text finished signal for items not in context phase + state.text_finished = state.text_finished | ~needs_context + + # --- Phoneme embedding for phoneme and audio phase items --- + if self.phoneme_tokenizer is not None: + if needs_phoneme.any(): + phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: + # Teacher forcing: use pre-computed GT phoneme embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) + positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) + gt_emb = state.gt_phoneme_embeddings[ + torch.arange(batch_size, device=device), positions, : + 
].unsqueeze( + 1 + ) # (B, 1, E) + phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + gt_emb * phoneme_mask + else: + # Prediction mode: use BOS or last predicted phoneme + first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) + has_last_phoneme = ( + needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) + ) + + if first_phoneme_step.any(): + phoneme_bos = torch.full( + (batch_size, self.phoneme_stacking_factor, 1), + self.phoneme_tokenizer.bos_token_id, + device=device, + ).long() + phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) + first_mask = first_phoneme_step.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask + + if has_last_phoneme.any() and state.last_phoneme_tokens is not None: + last_phoneme_emb = self.embed_phoneme_tokens( + state.last_phoneme_tokens.unsqueeze(2) + ) # (B, 1, E) + last_mask = has_last_phoneme.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + # Only end phoneme stream in prediction mode when the phoneme EOS is detected + state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected + + next_input = next_input + phoneme_emb + + # --- Audio embedding for audio phase items --- + if needs_audio.any(): + audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.gt_audio_embeddings is not None: + # Teacher forcing: use pre-computed GT audio embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) + positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) + gt_emb = state.gt_audio_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) + audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() + audio_emb = audio_emb + gt_emb * 
audio_mask + else: + # Prediction mode: use BOS or last predicted audio + first_audio_step = needs_audio & (state.audio_steps == 0) + has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask + + next_input = next_input + audio_emb + + # ==================== HANDLE CFG ==================== + if state.use_cfg: + # For unconditional branch, use dummy embedding for non-audio items + # and audio-only embedding for audio items + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( + batch_size, 1, -1 + ) + # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts + next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input_unconditional = ( + context_mask * next_input_unconditional_context + + (1 - context_mask) * next_input_unconditional_zeros + ) + + # For audio phase items, we use audio embedding for the unconditional branch + if needs_audio.any(): + audio_mask = needs_audio.view(batch_size, 1, 1).float() + next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask + + # Concatenate conditional and unconditional: (2*B, 1, E) 
+ next_input = torch.cat([next_input, next_input_unconditional], dim=0) + + # ==================== FORWARD PASS ==================== + cache_position = torch.tensor([state.cache_seq_len], device=device) + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=state.past_key_values, + cache_position=cache_position, + ) + + state.last_hidden = transformer_out.last_hidden_state + state.past_key_values = transformer_out.past_key_values + state.cache_seq_len += 1 + + # ==================== UPDATE STATE ==================== + # Update context_position for items in context phase + state.context_position = state.context_position + needs_context.long() + # Keep updating text_tokens_seen for items once the context is finished + # This is because this counter is used to determine when to start predicting phonemes and audio + state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() + + # Update phoneme_steps for items in phoneme or audio phase + state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() + + # Update audio_steps for items in audio phase + state.audio_steps = state.audio_steps + needs_audio.long() + + # ==================== PREDICTIONS ==================== + pred_phoneme_tokens = None + audio_codes_next = None + + # Phoneme predictions for items in phoneme or audio phase + if needs_phoneme.any() and self.phoneme_tokenizer is not None: + # Track phoneme prediction start index for items just entering phoneme phase + first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) + if first_phoneme_step.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append + state.phoneme_prediction_start_idx = torch.where( + first_phoneme_step, + torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), + state.phoneme_prediction_start_idx, + ) + + # Check which items should predict phonemes (not ended) + pred_phoneme_tokens = 
self._predict_phoneme_tokens(state)  # (B, phoneme_stacking_factor)
                state.last_phoneme_tokens = pred_phoneme_tokens
                state.all_phoneme_predictions.append(pred_phoneme_tokens)

                # Check for phoneme EOS per item
                # A step counts as EOS if ANY of the stacked phoneme channels predicted EOS.
                phoneme_eos_detected = needs_phoneme & (
                    pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id
                ).any(
                    dim=1
                )  # (B,)

                state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected

                # Track phoneme prediction end index for items that just ended
                # (-1 is the sentinel for "not yet ended"; only the first EOS sets it).
                newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1)
                if newly_ended_phoneme.any():
                    current_phoneme_step_idx = len(state.all_phoneme_predictions)  # after append
                    state.phoneme_prediction_end_idx = torch.where(
                        newly_ended_phoneme,
                        torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx),
                        state.phoneme_prediction_end_idx,
                    )

            # Audio predictions for items in audio phase
            if needs_audio.any():
                # Track audio prediction start index for items just entering audio phase
                first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1)
                if first_audio_step.any():
                    # Track start in terms of frames (not steps)
                    current_frame_idx = sum(p.size(-1) for p in state.all_predictions)  # total frames so far
                    state.audio_prediction_start_idx = torch.where(
                        first_audio_step,
                        torch.full_like(state.audio_prediction_start_idx, current_frame_idx),
                        state.audio_prediction_start_idx,
                    )

                audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state)  # (B, C*S)

                # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor
                S = self.frame_stacking_factor
                C = self.num_audio_codebooks
                audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S)  # (B, C, S)

                # Update last_audio_codes with stacked format (needed for next step's embedding)
                # Items not in audio phase keep their previous codes via the mask below.
                if state.last_audio_codes is None:
                    state.last_audio_codes = audio_codes_next_stacked
                else:
                    update_mask = needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked)
                    state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes)

                # Check for EOS in each frame and track exact end position
                # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead
                if state.gt_audio_embeddings is None:
                    # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S)
                    all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S)

                    # For each batch item, find if/where EOS occurs in this step's frames
                    # EOS from EITHER the sampled codes or the argmax codes ends the stream,
                    # so the argmax acts as an early-stop signal even when sampling missed EOS.
                    eos_in_sampled = audio_codes_unstacked == self.audio_eos_id  # (B, C, S)
                    eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id  # (B, C, S)
                    eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1)  # (B, S)

                    # Find first frame with EOS per batch item (or S if none)
                    eos_frame_idx = torch.where(
                        eos_any_codebook.any(dim=1),
                        eos_any_codebook.int().argmax(dim=1),  # first frame with EOS
                        torch.full((batch_size,), S, device=device),  # no EOS in this step
                    )  # (B,)

                    audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio
                    state.finished = state.finished | audio_eos_detected

                    # Track audio prediction end index (in frames) for items that just ended
                    newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1)
                    if newly_ended_audio.any():
                        # End index = current frame count + frame offset where EOS was found
                        # NOTE(review): start idx above uses sum(p.size(-1)) while this uses
                        # len(all_predictions) * frame_stacking_factor; they agree only because
                        # every step appends exactly frame_stacking_factor frames — confirm.
                        current_frame_count = len(state.all_predictions) * self.frame_stacking_factor
                        end_frame_idx = current_frame_count + eos_frame_idx
                        state.audio_prediction_end_idx = torch.where(
                            newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx
                        )

                # Store unstacked codes
                state.all_predictions.append(audio_codes_unstacked)
                audio_codes_next = audio_codes_unstacked

            # Force-finish items when GT audio is exhausted (teacher forcing).
            # This is checked AFTER predictions so the last valid prediction is still made.
            # audio_steps was already incremented above. When audio_steps >= gt_audio_lens,
            # we've consumed all GT input positions and made all corresponding predictions.
            if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None:
                gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens)
                state.finished = state.finished | gt_exhausted

        return state, audio_codes_next, pred_phoneme_tokens

    def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor:
        """Predict phoneme tokens from the last hidden state."""
        actual_batch_size = state.batch_size
        last_hidden = state.last_hidden

        # Get phoneme logits
        # With CFG enabled the hidden states are [conditional; unconditional] stacked on
        # the batch dim; slicing to actual_batch_size keeps only the conditional half,
        # so phoneme prediction never uses the unconditional branch.
        all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :])
        all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size]
        phoneme_logits = all_code_logits_t_phoneme.view(
            actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size
        )
        # Per-channel max softmax probability, used below as a confidence score.
        max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values  # (B, phoneme_stacking_factor)

        # Sample phonemes
        # NOTE(review): temperature=0.0 presumably makes sample_codes_from_logits_phoneme
        # fall back to argmax — confirm against its implementation.
        if state.phoneme_sampling_method == 'argmax':
            pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0)
        else:
            pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(
                all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk
            )

        # In prediction mode, low-confidence phoneme steps are replaced with UNK across
        # all stacked channels (except steps where EOS is predicted).
        if (
            state.phoneme_input_type != 'gt'
            and hasattr(self.phoneme_tokenizer, 'unk_token_id')
            and self.phoneme_confidence_unk_threshold > 0.0
        ):
            # A step is under-confident if ANY stacked channel falls below the threshold.
            underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(
                dim=1, keepdim=True
            )  # (B, 1)
            # EOS steps are exempt so termination is never masked by UNK substitution.
            eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True)
            replace_with_unk = underconfident_step & (~eos_predicted_step)
            if replace_with_unk.any():
                unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id)
                pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens)
        # (B, phoneme_stacking_factor)
        return pred_phoneme_tokens

    def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict audio codes from the last hidden state."""
        actual_batch_size = state.batch_size
        last_hidden = state.last_hidden

        # Compute audio logits
        last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :])
        all_code_logits_t = self.final_proj(last_hidden_audio)

        # Apply CFG if enabled
        # Standard CFG extrapolation: scale * cond + (1 - scale) * uncond;
        # cfg_scale == 1.0 reduces to the conditional logits.
        if state.use_cfg:
            conditional_logits = all_code_logits_t[:actual_batch_size]
            unconditional_logits = all_code_logits_t[actual_batch_size:]
            all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits

        # Sample audio codes
        # NOTE(review): last_hidden still contains the unconditional rows when CFG is on;
        # _sample_audio_codes receives use_cfg/cfg_scale so it is presumably handled
        # there (e.g. by the local transformer path) — confirm.
        audio_codes_next, all_codes_next_argmax = self._sample_audio_codes(
            last_hidden=last_hidden,
            all_code_logits_t=all_code_logits_t,
            temperature=state.temperature,
            topk=state.topk,
            use_local_transformer_for_inference=state.use_local_transformer,
            use_cfg=state.use_cfg,
            cfg_scale=state.cfg_scale,
        )

        return audio_codes_next, all_codes_next_argmax

    def streaming_finalize(
        self,
        state: StreamingState,
        use_inference_mode: bool = True,
    ) -> StreamingFinalizeOutput:
        """
        Finalize streaming and return the complete generated audio and phoneme predictions.

        This function should be called after all streaming_step() calls are complete
        (i.e., when state.finished.all() is True or max steps reached).

        Args:
            state: Final StreamingState after streaming is complete.

        Returns:
            StreamingFinalizeOutput containing audio, codes, and phoneme predictions.
        """
        batch_size = state.batch_size

        # Extract and decode phoneme predictions
        phoneme_tokens_list: List[List[int]] = []
        phoneme_text_list: List[str] = []
        if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0:
            # Stack phoneme predictions: each is (B, phoneme_stacking_factor)
            all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1)  # (B, S, T)
            for i in range(batch_size):
                # start/end index into the step axis; -1 end sentinel means "never ended".
                start = max(0, state.phoneme_prediction_start_idx[i].item())
                end = state.phoneme_prediction_end_idx[i].item()
                if end < 0:
                    end = all_phonemes.size(-1)
                # Flatten stacked phonemes back to sequence
                # .T interleaves the stacked channels back into temporal order.
                tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist()
                # Remove special tokens (BOS, EOS, PAD)
                special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id}
                if hasattr(self.phoneme_tokenizer, 'pad_token_id'):
                    special.add(self.phoneme_tokenizer.pad_token_id)
                tokens = [t for t in tokens if t not in special]
                phoneme_tokens_list.append(tokens)
                phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens))
        else:
            phoneme_tokens_list = [[] for _ in range(batch_size)]
            phoneme_text_list = ["" for _ in range(batch_size)]

        if len(state.all_predictions) == 0:
            # No audio was ever predicted — return empty outputs (phonemes may still exist).
            # NOTE(review): audio_codes here defaults to float dtype, while the non-empty
            # path returns tensors with the codes' (integer) dtype — consider dtype=torch.long.
            return StreamingFinalizeOutput(
                audio=torch.zeros(batch_size, 0, device=state.device),
                audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device),
                audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                phoneme_tokens=phoneme_tokens_list,
                phoneme_text=phoneme_text_list,
            )

        grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad
        with grad_ctx():
            # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames)
            all_codes = torch.cat(state.all_predictions, dim=-1)  # (B, C, T_total_frames)
            total_frames = all_codes.size(-1)
            num_codebooks = all_codes.size(1)

            # Start and end indices are in frames (not steps)
            # If start_idx is -1, item never started audio predictions - use 0
            # If end_idx is -1, item never ended - use total_frames
            start_indices = torch.clamp(state.audio_prediction_start_idx, min=0)
            end_indices = torch.where(
                state.audio_prediction_end_idx >= 0,
                state.audio_prediction_end_idx,
                torch.full_like(state.audio_prediction_end_idx, total_frames),
            )

            # Calculate per-item lengths (in frames)
            predicted_codes_lens = end_indices - start_indices
            max_len = predicted_codes_lens.max().item()

            # Handle case where all items have zero-length predictions
            if max_len == 0:
                return StreamingFinalizeOutput(
                    audio=torch.zeros(batch_size, 0, device=state.device),
                    audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                    audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype),
                    audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                    phoneme_tokens=phoneme_tokens_list,
                    phoneme_text=phoneme_text_list,
                )

            # Create padded output tensor and slice each item's valid predictions
            predicted_codes = torch.zeros(
                batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device
            )
            for i in range(batch_size):
                start = start_indices[i].item()
                end = end_indices[i].item()
                length = end - start
                if length > 0:
                    predicted_codes[i, :, :length] = all_codes[i, :, start:end]

            # No need to remove EOS - end_indices already point to the frame before EOS
            # Decode to audio (codes are already unstacked: B, C, T)
            # NOTE(review): decoded_codes appears unused in this function — confirm
            # whether the post-decode codes should be returned instead of predicted_codes.
            audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens)

return StreamingFinalizeOutput( + audio=audio, + audio_len=audio_len, + audio_codes=predicted_codes, + audio_codes_len=predicted_codes_lens, + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + def infer_batch( + self, + batch: Dict[str, torch.Tensor], + max_decoder_steps: int = 500, + temperature: float = 0.7, + topk: int = 80, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer_for_inference: bool = False, + phoneme_input_type: str = 'pred', + phoneme_sampling_method: str = 'argmax', + force_dropout_text: bool = False, + use_teacher_forced: bool = False, + use_inference_mode: bool = True, + ) -> InferBatchOutput: + """ + Batch inference using streaming infrastructure. + + This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize + that processes a batch dictionary similar to training_step/validation_step. + + Args: + batch: Dictionary containing: + - text: Text token IDs (B, L) + - text_lens: Lengths (B,) + - context_text_tokens: Context text tokens (B, L') + - context_text_tokens_lens: Lengths (B,) + - context_audio_codes: Context audio codes (B, C, T) OR + - context_audio / context_audio_lens: Raw context audio to encode + - phoneme_tokens (optional): GT phoneme tokens (B, L'') + - phoneme_tokens_lens (optional): Lengths (B,) + For teacher forcing (use_teacher_forced=True), also requires: + - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR + - audio / audio_lens: Raw audio waveforms to encode + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature for audio codes. Use 0.0 for argmax. + topk: Top-k sampling parameter. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer_for_inference: Whether to use local transformer. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. 
+ force_dropout_text: Whether to dropout text embeddings. + use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) + instead of predicted codes at each streaming step. + + Returns: + InferBatchOutput containing predicted audio, codes, and RTF metrics. + """ + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + start_time = time.time() + + # Extract tensors from batch + text = batch['text'] + text_lens = batch['text_lens'] + context_text_tokens = batch['context_text_tokens'] + context_text_tokens_lens = batch['context_text_tokens_lens'] + + # Handle context audio - either use codes directly or encode from audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + # Optional GT phoneme tokens for teacher forcing + gt_phoneme_tokens = batch.get('phoneme_tokens') + gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + + # Prepare GT audio codes for teacher forcing if requested + gt_audio_codes_for_init = None + gt_audio_codes_lens_for_init = None + if use_teacher_forced: + # Force GT phoneme input and argmax sampling + phoneme_input_type = 'gt' + temperature = 0.0 + + # Get GT audio codes + if 'audio_codes' in batch: + gt_audio_codes = batch['audio_codes'] + gt_audio_codes_lens = batch['audio_codes_lens'] + elif 'audio' in batch: + gt_audio = batch['audio'] + gt_audio_lens = batch['audio_lens'] + gt_audio_codes, gt_audio_codes_lens = self.audio_to_codes(gt_audio, gt_audio_lens) + else: + raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") + + # Convert and add special tokens, then stack + if self._codec_converter is not None: + gt_audio_codes = 
self._codec_converter.convert_original_to_new( + audio_tokens=gt_audio_codes, audio_lens=gt_audio_codes_lens + ).long() + + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + codes=gt_audio_codes, + codes_len=gt_audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( + gt_audio_codes_processed, + gt_audio_codes_lens_processed, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + + # Input portion: all tokens except the last (teacher forcing shift) + gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] + gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 + + batch_size = text.size(0) + + # Initialize streaming state + state = self.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer_for_inference, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + gt_audio_codes=gt_audio_codes_for_init, + gt_audio_codes_lens=gt_audio_codes_lens_for_init, + use_inference_mode=use_inference_mode, + ) + + time_to_first_prediction = None + generation_start_time = time.time() + device = text.device + + # Generate until all items are finished or max steps reached + print("Generation started") + gen_step = 0 + while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + gen_step += 1 + if gen_step % 10 == 0: + print(f"Generation step {gen_step} ") + # Gather the correct text token for each batch item based on text_tokens_seen + # Items in context phase 
will have their token ignored by streaming_step + positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) + current_tokens = text[torch.arange(batch_size, device=device), positions] + + # For items that have exhausted their text, provide EOS token + text_exhausted = state.text_tokens_seen >= text_lens + current_tokens = torch.where( + text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens + ) + + state, audio_codes, phoneme_tokens = self.streaming_step( + state=state, + text_tokens=current_tokens, + force_dropout_text=force_dropout_text, + use_inference_mode=use_inference_mode, + ) + + # Record time to first audio prediction + if time_to_first_prediction is None and audio_codes is not None: + time_to_first_prediction = time.time() - start_time + + tts_generation_time = time.time() - generation_start_time + + # Finalize and decode audio + finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) + + end_time = time.time() + total_time = end_time - start_time + + # Compute RTF metrics + total_audio_samples = finalize_output.audio_len.sum().item() + total_audio_duration = total_audio_samples / self.output_sample_rate + num_frames = len(state.all_predictions) + tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 + + rtf_metrics = { + 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, + 'time_to_first_prediction': time_to_first_prediction, + 'tts_generation_time': tts_generation_time, + 'total_time': total_time, + 'total_audio_duration': total_audio_duration, + 'total_audio_samples': total_audio_samples, + 'num_decoder_steps': num_frames, + 'tts_generation_time_per_frame': tts_generation_time_per_frame, + } + + # Prepare phoneme token output if available + predicted_phoneme_tokens = None + predicted_phoneme_tokens_lens = None + phoneme_prediction_start_idx_out = None + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + 
predicted_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + # Per-item valid phoneme prediction lengths + phoneme_start = torch.clamp(state.phoneme_prediction_start_idx, min=0) + phoneme_end = torch.where( + state.phoneme_prediction_end_idx >= 0, + state.phoneme_prediction_end_idx, + torch.full_like( + state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1) + ), + ) + predicted_phoneme_tokens_lens = phoneme_end - phoneme_start + phoneme_prediction_start_idx_out = phoneme_start + + return InferBatchOutput( + predicted_audio=finalize_output.audio, + predicted_audio_lens=finalize_output.audio_len, + predicted_codes=finalize_output.audio_codes, + predicted_codes_lens=finalize_output.audio_codes_len, + rtf_metrics=rtf_metrics, + predicted_phoneme_tokens=predicted_phoneme_tokens, + predicted_phoneme_tokens_lens=predicted_phoneme_tokens_lens, + phoneme_prediction_start_idx=phoneme_prediction_start_idx_out, + ) + + @staticmethod + def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: + audio_data, sr = sf.read(audio_path, dtype='float32') + if len(audio_data.shape) > 1: + audio_data = audio_data[:, 0] + audio_tensor = torch.tensor(audio_data).unsqueeze(0) + if sr != target_sample_rate: + import torchaudio + + audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sample_rate) + return audio_tensor.unsqueeze(0) + + @staticmethod + def _adjust_audio_to_duration_for_inference( + audio: torch.Tensor, sample_rate: int, target_seconds: float, codec_model_samples_per_frame: int + ) -> torch.Tensor: + target_samples = int(target_seconds * sample_rate) + target_samples = (target_samples // codec_model_samples_per_frame) * codec_model_samples_per_frame + if audio.size(-1) > target_samples: + audio = audio[:, :, :target_samples] + elif audio.size(-1) < target_samples: + # repeat to fill + repeats = target_samples // audio.size(-1) + 1 + audio = audio.repeat(1, 1, repeats)[:, :, 
:target_samples] + return audio + + def do_tts( + self, + transcript: str, + context_audio_file_path: Optional[str] = None, + context_text: str = "[NO TEXT CONTEXT]", + main_tokenizer_name: Optional[str] = None, + context_audio_duration: float = 5.0, + use_cfg: bool = True, + cfg_scale: float = 2.5, + use_local_transformer: bool = True, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 330, + gt_phoneme_text: Optional[str] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate speech from transcript using EasyMagpie inference with optional context text/audio. + Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. + """ + if transcript is None or transcript.strip() == "": + raise ValueError("`transcript` must be a non-empty string.") + + device = next(self.parameters()).device + transcript = transcript.strip() + context_text = (context_text or "[NO TEXT CONTEXT]").strip() + + if main_tokenizer_name is None: + # Match model init behavior: default to first configured tokenizer. + main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] + if main_tokenizer_name not in self.tokenizer.tokenizers: + raise ValueError( + f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" + f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" + ) + + text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] + text = torch.tensor([text_tokens], dtype=torch.long, device=device) + text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) + + context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) + context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) + + if context_audio_file_path is not None and context_audio_file_path.strip() != "": + context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) + context_audio = self._adjust_audio_to_duration_for_inference( + context_audio, + self.sample_rate, + context_audio_duration, + self.codec_model_samples_per_frame, + ) + context_audio = context_audio.to(device) + context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + else: + context_audio_codes = torch.zeros( + 1, + self.data_num_audio_codebooks, + 0, + dtype=torch.long, + device=device, + ) + context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tensor, + 'context_text_tokens_lens': context_text_lens, + 'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + } + phoneme_input_type = 'pred' + if gt_phoneme_text is not None: + if self.phoneme_tokenizer is None: + raise ValueError( + "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
+ ) + gt_phoneme_text = gt_phoneme_text.strip() + if gt_phoneme_text == "": + raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") + gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + gt_phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) + if len(gt_phoneme_tokens) == 0: + raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") + batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) + batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) + phoneme_input_type = 'gt' + + with torch.inference_mode(): + output = self.infer_batch( + batch=batch, + max_decoder_steps=max_steps, + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer_for_inference=use_local_transformer, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method='argmax', + use_teacher_forced=False, + use_inference_mode=True, + ) + return output.predicted_audio, output.predicted_audio_lens + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + return [] diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index 4d34471af5a1..28af39542f21 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -33,13 +33,12 @@ from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from torch import nn -from torch.utils.data import get_worker_info - from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers from nemo.collections.tts.losses.aligner_loss import ForwardSumLoss from nemo.collections.tts.losses.moe_loss import 
MoEAuxiliaryLoss, compute_expert_usage from nemo.collections.tts.models import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel, worker_init_fn from nemo.collections.tts.modules import transformer_2501 from nemo.collections.tts.modules.aligner import AlignmentEncoder from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter @@ -48,7 +47,6 @@ EOSDetectionMethod, LocalTransformerType, SpecialAudioToken, - cosine_schedule, ) from nemo.collections.tts.parts.utils.helpers import ( binarize_attention_parallel, @@ -61,7 +59,6 @@ get_tokenizer_for_language, stack_tensors, ) -from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -302,17 +299,7 @@ def from_dict(cls, data: dict) -> 'ModelInferenceParameters': return cls(**filtered_data) -def worker_init_fn(worker_id): - # For mp.set_start_method("spawn", force=True) - # The dataset class should be picklable, so we initialize non-picklable objects here - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset # Get the dataset instance in this worker - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - - -class MagpieTTSModel(ModelPT): +class MagpieTTSModel(BaseMagpieTTSModel): """ Magpie-TTS Model Base Class used for training a TTS model that can generate audio codes from transcript and a context audio/text @@ -489,6 +476,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) + # Identity projections required by BaseMagpieTTSModel local transformer methods. + # MagpieTTSModel embeds directly in embedding_dim, so no projection is needed. 
+ self.audio_in_projection = nn.Identity() + self.local_transformer_audio_out_projection = nn.Identity() + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -753,29 +745,11 @@ def _setup_inference_parameters(self, cfg: DictConfig) -> None: """ self.inference_parameters = ModelInferenceParameters.from_dict(cfg.get("inference_parameters", {})) - def state_dict(self, destination=None, prefix='', keep_vars=False): - """ - Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model - from the checkpoint. The codec model is saved in a separate checkpoint. - - _speaker_verification_model is only included in older checkpoints with the older single_encoder_sv_tts - model_type that is no longer supported and can likely be removed in a future version. - - If the model has a baked context embedding, the context_encoder weights are also excluded - since they are no longer needed for inference. 
- """ - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - # Don't save the speaker verification and codec model in the state dict - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] - # If we have a baked context embedding, exclude context_encoder weights + def _get_state_dict_keys_to_exclude(self): + keys = ['_speaker_verification_model', '_codec_model'] if self.has_baked_context_embedding: - keys_substrings_to_exclude.append('context_encoder') - for key in list(state_dict.keys()): - if any([substring in key for substring in keys_substrings_to_exclude]): - del state_dict[key] - return state_dict + keys.append('context_encoder') + return keys def check_frame_stacking_config_validity(self): """ @@ -1021,83 +995,6 @@ def load_state_dict(self, state_dict, strict=True): new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - # Insert EOS token at new final token entry - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, 
codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - # codes: (B, T', C) - # codes_len: (B,) - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - # embedded_len: (B,) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Pass the modified integer token IDs - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - # audio: (B, T) - # audio_len: (B,) - return audio, audio_len, codes - def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): B, C, T = 
audio_tokens.shape audio_tokens = self.pad_audio_codes(audio_tokens).long() @@ -1118,116 +1015,6 @@ def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): return audio_embedding, audio_embedding_lens - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """ - Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. - This function is used in training and validation, not inference/sampling. - The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, - (using an 8-codebook setup as an example): - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - | codebook | | | | | | | | | | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | codebook | | | | | | | | | | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | codebook | latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | seq. index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - Args: - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) - if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. 
(MaskGit) - """ - C = self.num_audio_codebooks - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) - local_transformer_input = [dec_out_all] - audio_codes_target = self.pad_audio_codes(audio_codes_target).long() - # Build the teacher-forced input to the LT. - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(C): - # Collect ground truth codes for the current codebook and frame stack index combintation. - codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] # (B, T') - # Individual timesteps are independently handled by the LT fold time into the batch dimension. - codes = codes.reshape(-1) # (B*T',) - # Embed the codes - codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) # (B*T', E) - local_transformer_input.append(codebook_embedding) - # Stack the input codes along dimension 1 (codebooks). This is the dimension along which the LT predicts iteratively. - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) - if not targets_offset_by_one: - # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) - else: - # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
- local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) - all_code_logits = [] - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(audio_codes_target.size(1)): - # Using a separate projection layer for each codebook (to distinguish between them) - # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( - local_transformer_output[:, codebook_num + fs_index * C, :] - ) # (B*T', num_all_tokens_per_codebook) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat( - all_code_logits, dim=1 - ) # (B*T'/frame_stacking_factor, num_codebooks * num_all_tokens_per_codebook * frame_stacking_factor) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 - ) # (B, T'/frame_stacking_factor, C * num_all_tokens_per_codebook * frame_stacking_factor) - - return all_code_logits - - def maskgit_create_random_mask(self, codes): - """ - Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. - """ - # Codes: (B, C, T) - B, C, T = codes.shape - # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
- rand_values = torch.rand(B, T, device=codes.device) - # apply the cosine schedule - frac_masked = cosine_schedule(rand_values) - # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T - # The code further below is the vectorized version of this: - # for b in range(B): - # for t in range(T): - # if n_masked[b,t] > 0: - # # get a random permutation of the codebook indices - # perm = torch.randperm(C) - # # mask the top n_masked positions - # mask[b, perm[:n_masked[b,t]], t] = True - # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) - # Apply the random permutations to the mask - mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - - def maskgit_apply_random_mask(self, codes): - # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. - # Codes: (B, C, T) - mask = self.maskgit_create_random_mask(codes) - # replace some tokens with MASK_TOKEN - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None, frame_stacking_factor=1): """ Computes the audio codebook loss. Used by: @@ -1373,376 +1160,6 @@ def code_to_str(code): output_str += c logging.debug(output_str) - def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: - """ - Sets logits of forbidden tokens to `-inf` so they will never be sampled. - Specifically, we forbid sampling of all special tokens except AUDIO_EOS - which is allowed by default. 
- - Args: - logits: (B, C, num_audio_tokens_per_codebook) - forbid_audio_eos (bool, optional): If True, also forbid AUDIO_EOS tokens - from being sampled. Default: False. - """ - logits[ - :, - :, - SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), - ] = float('-inf') - return logits - - def local_transformer_sample_maskgit( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - n_steps: int = 3, - noise_scale: float = 0.0, - fixed_schedule: Optional[List[int]] = None, - dynamic_cfg_scale: bool = False, - sampling_type: Optional[str] = None, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """ - Sample audio codes for the current timestep using MaskGit-like iterative - prediction with the local transformer. If frame-stacking is enabled, the - codes for all frames in the stack are sampled, treated as one long sequence. - - The MaskGit process starts with all positions masked and iteratively unmasks the - most confident positions over multiple steps. By "masked" we mean that a - dedicated MASK token is used (as opposed to attention masking). The LT in this - case is a non-causal transformer decoder. At each step the model predicts all - positions at once. Of those predictions, a subset of the most confident - previously-masked positions is kept and unmasked in the next step. The number of - positions that are unmasked at each step is determined by the unmasking - schedule. We support a cosine schedule and a fixed schedule provided by the - user. - - Uses multinomial sampling with temperature, top-k, and classifier-free guidance (CFG). - - Special handling: - - * forbids special tokens (like AUDIO_BOS, AUDIO_CONTEXT_EOS, etc.) 
from being sampled - * forces / forbids EOS for finished / unfinished items respectively - * optionally, globally forbids audio EOS for all items in the batch. - This is useful early in the generation process. - * supports different unmasking methods, see `sampling_type` argument for details. - - Args: - dec_output (torch.Tensor): Decoder output tensor with shape (B, E) where B is batch size - and E is primary decoder's embedding dimension. - temperature (float, optional): Sampling temperature - topk (int, optional): Number of top-probability tokens to consider in sampling. - unfinished_items (dict, optional): Dictionary containing indices of batch - items that we are confident have not completed generation. For these items, audio EOS - sampling is forbidden. - finished_items (dict, optional): Dictionary containing indices of batch - items that we are confident are completed. For these items, audio EOS sampling - is forced. - use_cfg (bool, optional): Whether to use classifier-free guidance. If True, expects batch size - to be doubled with conditional and unconditional outputs from the primary decoder. - cfg_scale (float, optional): Scale factor for classifier-free guidance. Only used if use_cfg=True. - n_steps (int, optional): Number of iterative refinement steps for MaskGit sampling. - noise_scale (float, optional): Scale factor for noise to add to confidence scores - during sampling (experimental). - fixed_schedule (list, optional): Fixed schedule for number of tokens to unmask at each step. - If None, uses cosine schedule. - dynamic_cfg_scale (bool, optional): Whether to dynamically adjust CFG scale during - sampling (experimental). - sampling_type (str, optional): Type of sampling strategy. Options are: - ["default", "causal", "purity_causal", "purity_default"]. - - * Purity refers to "purity sampling" from https://arxiv.org/abs/2304.01515. If "purity" - is not specified, confidence sampling is used as in the original MaskGit paper. 
- * "default"/"causal": Controls the order of unmasking across frames when frame-stacking is enabled. - If "causal" is specified, frames are unmasked in causal order. "default" - doesn't impose any constraints on the unmasking order. - forbid_audio_eos (bool, optional): Whether to globally forbid audio EOS for the entire - batch. - - Returns: - torch.Tensor: Sampled audio codes with shape (B, num_codebooks, frame_stacking_factor) - """ - # dec_output: (B, E) - device = dec_output.device - # disable KV cache since our transformer is not causal - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection( - dec_output - ) # (B, 1, D) where D is the dimension of the local transformer - codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor - B = dec_output.size(0) - - min_confidence = 0 - # this needs to be large enough that unmasked items will always remain unmasked (even after noise addition) - # Setting it smaller could allow "regret", i.e. 
re-masking a codebook that was previously unmasked; we might want to try that - max_confidence = 5 - confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) - # initialize to all masked - codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) - sampled_codes = codes.clone() - if fixed_schedule is not None: - n_steps = len(fixed_schedule) - for step in range(n_steps): - # how far along we are in the unmasking process - progress = step / n_steps - # get mask fraction - frac_masked = cosine_schedule(torch.tensor(progress)) - if sampling_type == "causal" or sampling_type == "purity_causal": - frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) - # how many codebooks to mask - if fixed_schedule is None: - n_masked = torch.ceil(codebook_seq_len * frac_masked).long() - else: - n_masked = codebook_seq_len - fixed_schedule[step] - n_unmasked = codebook_seq_len - n_masked - - if ( - sampling_type == "causal" or sampling_type == "purity_causal" - ): # and n_unmasked <= self.num_audio_codebooks: - # force second frame not to be unmasked - n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = ( - min_confidence - 1 - ) # only tested for frame_stacking_factor=2 - - # pick top-confidence codebooks up to n_unmasked - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - if use_cfg: - actual_batch_size = topk_indices.size(0) // 2 - assert ( - topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size] - ).all(), "Topk indices are not the same for conditional and unconditional codes" - - # replace masks of the top-k confident codebooks with the codes that were sampled for them - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input - local_transformer_input = local_transformer_input_init - for 
codebook_num in range(codebook_seq_len): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( - 1 - ) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, d_local) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, codebook_num+1, d_local) - - # run transformer - _mask = torch.ones(B, codebook_seq_len + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ - 'output' - ] # (B, C+1, d_local) - - # get logits - logits = [] - for codebook_num in range(codebook_seq_len): - # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) # (B, num_audio_tokens_per_codebook) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C*frame_stacking_factor, num_audio_tokens_per_codebook) - - # apply CFG - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - if not dynamic_cfg_scale: - current_cfg_scale = cfg_scale - else: - # gradually increase the scale until mid point through sampling, then reduce it again - progress = step / (n_steps - 1) - # interp = -abs(progress-0.5)+0.5 # increase from 0..1 in the interval from start to midpoint and then go back to zero - # interp = 1.0 - progress # decrease from 1 to 0 - interp = progress # gradually increase from 0 to 1 - current_cfg_scale = (cfg_scale - 1) * interp + 1.0 # 1.0 --> cfg_scale --> 1.0 - cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - # Disallow generation of special tokens - logits = 
self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) - - # handle unfinished and finished items - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, :, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) - logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) - if use_cfg: - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - if sampling_type != "purity_causal" and sampling_type != "purity_default": - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - else: - # use the max probability across all tokens for each codebook as the confidence for each codebook; known as "purity sampling" - confidences = probs.max(dim=2)[0] - # replace entries in sampled_codes with previously unmasked codebooks - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - # add noise to confidences (as in token-critic paper, https://arxiv.org/abs/2209.04439) - if noise_scale > 0.0: - # get noise from uniform distribution in the interval [-0.5, 0.5), scale it by `noise_scale`, - # and anneal it to 0 as we approach the end of the unmasking process - noise = ( - (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - ) # the +2 makes sure that by the last iteration the noise is exactly 0 - confidences += noise - # the conditional and unconditional get different noise and must be fixed to be 
the same again - confidences[actual_batch_size:] = confidences[:actual_batch_size] - confidence_eps = 0.1 - assert ( - confidences.max() + confidence_eps < max_confidence - ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" - # for unmasked codebooks, set confidence to max so that they will remain unmasked - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), "Codes contain mask tokens after completion of MaskGit sampling" - - # break stacked groups of frames into individual frames - codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute( - 0, 2, 1 - ) # B, C, frame_stacking_factor - - if use_cfg: - # drop unconditional codes - codes = codes[:actual_batch_size] - return codes - - def local_transformer_sample_autoregressive( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_kv_cache: bool = True, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """ - Sample audio codes autoregressively across codebooks using the local - transformer. Uses multinomial sampling with temperature, top-k, and - classifier-free guidance (CFG). - - The sequence is initialized with the primary decoder's hidden output as the only - input and is gradually extended a code for one codebook at a time, appending the - sampled code as input sequence for the next step. At the last step the sequence - is `num_codebooks` long. If frame stacking is enabled, codes for all frames in - the stack are sampled as one long sequence and the final sequence length is - `num_codebooks * frame_stacking_factor` codes long. - - Special handling: - * forbids special tokens (like AUDIO_BOS, AUDIO_CONTEXT_EOS, etc.) 
from being sampled - * forces / forbids EOS for finished / unfinished items respectively - * optionally, globally forbids audio EOS (useful early in the generation process) - - Args: - dec_output (torch.Tensor): Decoder output tensor with shape (B, E) where B is batch size - and E is primary decoder's embedding dimension. - temperature (float, optional): Sampling temperature. - topk (int, optional): Number of top-probability tokens to consider in sampling. - unfinished_items (dict, optional): Dictionary containing indices of batch - items that we are confident have not completed generation. For these items, audio EOS - sampling is forbidden. - finished_items (dict, optional): Dictionary containing indices of batch - items that we are confident are completed. For these items, audio EOS sampling - is forced. - use_cfg (bool, optional): Whether to use classifier-free guidance. If True, expects batch size - to be doubled with conditional and unconditional outputs from the primary decoder. - cfg_scale (float, optional): Scale factor for classifier-free guidance. Only used if use_cfg=True. - use_kv_cache (bool, optional): Whether to use key-value caching in the transformer. - forbid_audio_eos (bool, optional): Whether to globally forbid audio EOS for the entire - batch. - - Returns: - torch.Tensor: Sampled audio codes with shape (B, num_codebooks, frame_stacking_factor) - where B is batch size (or actual_batch_size if use_cfg=True). 
- """ - - self.local_transformer.reset_cache(use_cache=use_kv_cache) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, -1, :] - ) # (B, num_all_tokens_per_codebook) - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - # Disallow generation of special tokens - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos - ).squeeze(1) - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - if use_cfg: - codebook_preds[actual_batch_size:] = 
codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( - 1 - ) # (B, 1, 128) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, 128) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, 128) - - all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) - all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute( - 0, 2, 1 - ) # (B, num_codebooks, frame_stacking_factor) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - def sample_codes_from_logits( self, all_code_logits_t: torch.Tensor, @@ -2079,22 +1496,6 @@ def compute_alignment_loss(self, attention_scores, text_lens, audio_lens, dec_co ) return alignment_loss - def pad_audio_codes(self, audio_codes: torch.Tensor): - """ - Pads the time dimension of the audio codes to a multiple of the frame stacking factor. - Args: - audio_codes (torch.Tensor): B, C, T - frame_stacking_factor (int): The factor that frames will be stacked by. - pad_token (int): The token ID to pad with. 
- Returns: - B, C, T_padded - """ - T = audio_codes.size(2) - T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) - num_pad = T_padded - T - audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) - return audio_codes - def embed_context_text(self, context_text_tokens): if self.legacy_text_conditioning: context_text_tokens = ( diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index d7dd672867c3..580a6e32ebc7 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -428,6 +428,9 @@ def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str - moe_info: String for checkpoint naming (e.g., "MoE_8x2_d2048_softmax_"), empty for dense models - flops_per_component: Dict mapping component name (e.g., "decoder") to its FLOPs metrics dict """ + if isinstance(model, EasyMagpieTTSModel): + return "", {} + logging.info("=" * 60) logging.info("MODEL ARCHITECTURE SUMMARY") logging.info("=" * 60) From f62752322ae38f7e362e00766105e0541144f590 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Tue, 10 Mar 2026 02:25:00 +0000 Subject: [PATCH 81/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/base_magpietts.py | 22 +++++-------------- .../tts/models/easy_magpietts_inference.py | 9 ++------ 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index f3eacb945051..c2372a1f7980 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -18,14 +18,8 @@ import torch from torch.utils.data import get_worker_info -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - instantiate_phoneme_tokenizer, - setup_tokenizers, -) -from 
nemo.collections.tts.modules.magpietts_modules import ( - SpecialAudioToken, - cosine_schedule, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths from nemo.core.classes import ModelPT from nemo.utils import logging @@ -93,7 +87,7 @@ def load_state_dict(self, state_dict, strict=True): for key in state_dict.keys(): name_with_dot = f"{name}." if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot):]] = state_dict[key] + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) def setup_optimizer_param_groups(self): @@ -478,7 +472,7 @@ def local_transformer_sample_maskgit( if sampling_type == "causal" or sampling_type == "purity_causal": n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks:] = min_confidence - 1 + confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1 _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) if use_cfg: @@ -494,9 +488,7 @@ def local_transformer_sample_maskgit( for codebook_num in range(codebook_seq_len): next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) _mask = torch.ones(B, codebook_seq_len + 1, device=device) local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] @@ -545,9 +537,7 @@ def local_transformer_sample_maskgit( confidences 
= probs.max(dim=2)[0] sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) if noise_scale > 0.0: - noise = ( - (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - ) + noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) confidences += noise confidences[actual_batch_size:] = confidences[:actual_batch_size] confidence_eps = 0.1 diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 5bab45559174..a58f12c19b89 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -25,10 +25,7 @@ from torch import nn from transformers import AutoConfig, AutoModelForCausalLM -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - instantiate_phoneme_tokenizer, - setup_tokenizers, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 @@ -1861,9 +1858,7 @@ def infer_batch( phoneme_end = torch.where( state.phoneme_prediction_end_idx >= 0, state.phoneme_prediction_end_idx, - torch.full_like( - state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1) - ), + torch.full_like(state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1)), ) predicted_phoneme_tokens_lens = phoneme_end - phoneme_start phoneme_prediction_start_idx_out = phoneme_start From c8437acc31e26de3fa9a1f3a27571dcb54683556 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 02:10:45 -0400 Subject: [PATCH 82/94] cleanup Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/base_magpietts.py | 36 ------------------- 1 file changed, 36 deletions(-) diff --git 
a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index c2372a1f7980..63db14afd264 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -49,10 +49,6 @@ class BaseMagpieTTSModel(ModelPT): ``__init__``, data loading, training/inference logic, etc. """ - # ------------------------------------------------------------------ - # State-dict exclusion – subclasses override - # ------------------------------------------------------------------ - def _get_state_dict_keys_to_exclude(self) -> List[str]: """Return list of key substrings to exclude from checkpoint save/load. @@ -61,10 +57,6 @@ def _get_state_dict_keys_to_exclude(self) -> List[str]: """ return ['_codec_model'] - # ------------------------------------------------------------------ - # state_dict / load_state_dict / optimizer param groups - # ------------------------------------------------------------------ - def state_dict(self, destination=None, prefix='', keep_vars=False): if hasattr(self, '_no_state_dict') and self._no_state_dict: return {} @@ -109,10 +101,6 @@ def setup_optimizer_param_groups(self): self._optimizer_param_groups = [{"params": trainable_params}] - # ------------------------------------------------------------------ - # Special token helpers - # ------------------------------------------------------------------ - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): # codes: (B, C, T') codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) @@ -160,10 +148,6 @@ def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) return codes, codes_len - # ------------------------------------------------------------------ - # Audio codec helpers - # ------------------------------------------------------------------ - def audio_to_codes(self, audio, audio_len, sample_rate=None): 
self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): @@ -179,10 +163,6 @@ def codes_to_audio(self, codes, codes_len): audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) return audio, audio_len, codes - # ------------------------------------------------------------------ - # Padding / forbidden-logits helpers - # ------------------------------------------------------------------ - def pad_audio_codes(self, audio_codes: torch.Tensor): """Pads the time dimension of the audio codes to a multiple of the frame stacking factor. @@ -214,10 +194,6 @@ def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = ] = float('-inf') return logits - # ------------------------------------------------------------------ - # MaskGit helpers - # ------------------------------------------------------------------ - def maskgit_create_random_mask(self, codes): """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" B, C, T = codes.shape @@ -236,10 +212,6 @@ def maskgit_apply_random_mask(self, codes): codes_with_mask = torch.where(mask, self.mask_token_id, codes) return codes_with_mask, mask - # ------------------------------------------------------------------ - # Local transformer – training - # ------------------------------------------------------------------ - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """Predicts the logits for all codebooks using the local transformer. 
@@ -306,10 +278,6 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - # ------------------------------------------------------------------ - # Local transformer – AR sampling - # ------------------------------------------------------------------ - def local_transformer_sample_autoregressive( self, dec_output: torch.Tensor, @@ -401,10 +369,6 @@ def local_transformer_sample_autoregressive( return all_preds - # ------------------------------------------------------------------ - # Local transformer – MaskGit sampling - # ------------------------------------------------------------------ - def local_transformer_sample_maskgit( self, dec_output: torch.Tensor, From dc52f0aecc41c6899d44e9f41e6bbf6c0a635c77 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 02:39:37 -0400 Subject: [PATCH 83/94] sanitize logits only for easy magpie to preserve magpietts functionality Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/base_magpietts.py | 7 +++++-- nemo/collections/tts/models/easy_magpietts_inference.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index 63db14afd264..27073282da6c 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -289,6 +289,7 @@ def local_transformer_sample_autoregressive( cfg_scale: float = 1.0, use_kv_cache: bool = True, forbid_audio_eos: bool = False, + sanitize_logits: bool = False, ) -> torch.Tensor: """Sample audio codes autoregressively across codebooks using the local transformer. @@ -305,6 +306,7 @@ def local_transformer_sample_autoregressive( cfg_scale: Scale factor for CFG. use_kv_cache: Whether to use key-value caching in the local transformer. forbid_audio_eos: Whether to globally forbid audio EOS. + sanitize_logits: Whether to clamp/clean logits before sampling. 
Returns: Sampled audio codes (B, num_codebooks, frame_stacking_factor). @@ -329,8 +331,9 @@ def local_transformer_sample_autoregressive( cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + if sanitize_logits: + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index a58f12c19b89..c5b0bf56112b 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -905,6 +905,7 @@ def _sample_audio_codes( topk=topk, use_cfg=use_cfg, cfg_scale=cfg_scale, + sanitize_logits=True, ) # Base class returns (B, C, S); flatten to (B, C*S) for downstream code audio_codes_next = audio_codes_next.permute(0, 2, 1) From f680b8f62a6c602657e804feb10cf68691617677 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 12:29:10 -0400 Subject: [PATCH 84/94] remove custom phoneme tokenizer instantiation and handle it in the tokenizer class Signed-off-by: Paarth Neekhara --- .../tokenizers/text_to_speech/tts_tokenizers.py | 8 ++++++++ .../tts/data/text_to_speech_dataset_lhotse.py | 12 +----------- nemo/collections/tts/models/base_magpietts.py | 5 +++-- nemo/collections/tts/models/easy_magpietts.py | 5 ++--- .../tts/models/easy_magpietts_inference.py | 4 ++-- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py 
b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 65b27bc6b62f..0b6988e3e9a8 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1192,6 +1192,14 @@ def __init__(self, tokenizer_path: str): self._tokenizer = Tokenizer.from_file(tokenizer_file) self.tokens = self._tokenizer.get_vocab() + phoneme_vocab_size = len(self.tokens) + self.bos_token_id = phoneme_vocab_size + self.eos_token_id = phoneme_vocab_size + 1 + self.unk_token_id = phoneme_vocab_size + 2 + self.vocab_size = phoneme_vocab_size + 3 + self.tokens[""] = self.bos_token_id + self.tokens[""] = self.eos_token_id + self.tokens[""] = self.unk_token_id self.pad = self.tokens.get("", None) def encode(self, text: str) -> List[int]: diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index c1ac9975d215..f2de40bdb180 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -60,16 +60,6 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer -def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): - phoneme_tokenizer = instantiate(phoneme_tokenizer_config) - phoneme_vocab_size = len(phoneme_tokenizer.tokens) - phoneme_tokenizer.bos_token_id = phoneme_vocab_size - phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 - phoneme_tokenizer.unk_token_id = phoneme_vocab_size + 2 - phoneme_tokenizer.vocab_size = phoneme_vocab_size + 3 - return phoneme_tokenizer - - def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
pattern = r"\| Language:\w+ Dataset:[\w\d\W]+ Speaker:[\w\d\W]+ \|" @@ -207,7 +197,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: # initialize the phoneme tokenizer once per dataset/worker when config is available. if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) + self.phoneme_tokenizer = instantiate(self.phoneme_tokenizer_config) # define list to store batched information dataset_name_list = [] diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index 27073282da6c..f031ebf98fab 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -16,9 +16,10 @@ import numpy as np import torch +from hydra.utils import instantiate from torch.utils.data import get_worker_info -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths from nemo.core.classes import ModelPT @@ -37,7 +38,7 @@ def worker_init_fn(worker_id): tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) dataset.text_tokenizer = tokenizer if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) + dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config) class BaseMagpieTTSModel(ModelPT): diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 115b8e2d6a99..97e47284aac9 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -34,7 +34,6 @@ from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( MagpieTTSLhotseDataset, - instantiate_phoneme_tokenizer, setup_tokenizers, ) from nemo.collections.tts.models.base_magpietts import worker_init_fn @@ -1428,7 +1427,7 @@ def setup_training_data(self, dataset_cfg): mode='train', ) if self.cfg.get("phoneme_tokenizer", None) is not None: - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + dataset.phoneme_tokenizer = instantiate(self.cfg.phoneme_tokenizer) self._train_dl = torch.utils.data.DataLoader( dataset, @@ -1450,7 +1449,7 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) dataset.text_tokenizer = setup_tokenizers(all_tokenizers_config=self.cfg.text_tokenizers, mode='test') if self.cfg.get("phoneme_tokenizer", None) is not None: - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + dataset.phoneme_tokenizer = instantiate(self.cfg.phoneme_tokenizer) data_loader = torch.utils.data.DataLoader( dataset, diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index c5b0bf56112b..765c234e2683 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -25,7 +25,7 @@ from torch import nn from transformers import AutoConfig, AutoModelForCausalLM -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.models.base_magpietts import 
BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 @@ -306,7 +306,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.cfg_unk_token_id = num_tokens - 1 self.phoneme_tokenizer = None if cfg.get('phoneme_tokenizer', None) is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_tokenizer = instantiate(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size if cfg.get('phoneme_corruption_batch_prob', None) is None: From 40fb7ebbc12bdc1fa4a819fe98ba45d7c71df783 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Tue, 10 Mar 2026 16:30:23 +0000 Subject: [PATCH 85/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 97e47284aac9..5a117432b986 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -32,10 +32,7 @@ from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - MagpieTTSLhotseDataset, - setup_tokenizers, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers from nemo.collections.tts.models.base_magpietts import worker_init_fn from nemo.collections.tts.models.easy_magpietts_inference import ( EasyMagpieTTSInferenceModel, From c8ad57a3dc27f42b44cf68a4aa1d19a205add836 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Mar 2026 15:57:51 -0700 Subject: [PATCH 86/94] remove streaming 
inference script Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_streaming_inference.py | 1030 ----------------- 1 file changed, 1030 deletions(-) delete mode 100644 examples/tts/magpietts_streaming_inference.py diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py deleted file mode 100644 index d25172d4e1f6..000000000000 --- a/examples/tts/magpietts_streaming_inference.py +++ /dev/null @@ -1,1030 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -MagpieTTS Streaming Inference Test Script. - -This script tests the streaming TTS inference functionality, supporting both -single sample (batch_size=1) and batched inference (batch_size>1). - -For batched inference, each item in the batch can have different context lengths -and be in different processing phases (context, prompt, phoneme-only, audio). - -Example usage: - # Single sample inference from checkpoint - python examples/tts/magpietts_streaming_inference.py \ - --hparams_file /path/to/hparams.yaml \ - --checkpoint_file /path/to/model.ckpt \ - --codecmodel_path /path/to/codec.nemo \ - --context_audio /path/to/context.wav \ - --text "Hello, this is a test of streaming TTS inference." 
\ - --output_path /path/to/output.wav - - # Batched inference with multiple context audios - python examples/tts/magpietts_streaming_inference.py \ - --nemo_file /path/to/model.nemo \ - --codecmodel_path /path/to/codec.nemo \ - --context_audio /path/to/context1.wav /path/to/context2.wav \ - --context_duration 3.0 5.0 \ - --text "First text to synthesize." "Second text to synthesize." \ - --output_path /path/to/output.wav -""" -from __future__ import annotations - -import argparse -import os -import time -from typing import Optional - -import numpy as np -import soundfile as sf -import torch -from omegaconf import OmegaConf, open_dict - -from nemo.collections.tts.models import EasyMagpieTTSModel -from nemo.utils import logging - - -def load_model( - hparams_file: Optional[str], - checkpoint_file: Optional[str], - nemo_file: Optional[str], - codecmodel_path: str, - device: str = "cuda", -) -> EasyMagpieTTSModel: - """ - Load an EasyMagpieTTSModel from checkpoint or .nemo file. - - Args: - hparams_file: Path to hparams.yaml (required with checkpoint_file). - checkpoint_file: Path to .ckpt file (required with hparams_file). - nemo_file: Path to .nemo file (alternative to hparams + checkpoint). - codecmodel_path: Path to the audio codec model. - device: Device to load model on. - - Returns: - Loaded model ready for inference. 
- """ - if hparams_file is not None and checkpoint_file is not None: - # Load from hparams + checkpoint - logging.info(f"Loading model from checkpoint: {checkpoint_file}") - model_cfg = OmegaConf.load(hparams_file) - - # Handle different config structures - if "cfg" in model_cfg: - model_cfg = model_cfg.cfg - - with open_dict(model_cfg): - # Override codec model path - model_cfg.codecmodel_path = codecmodel_path - - # Disable training datasets - model_cfg.train_ds = None - model_cfg.validation_ds = None - - model = EasyMagpieTTSModel(cfg=model_cfg) - - # Load weights - ckpt = torch.load(checkpoint_file, weights_only=False) - state_dict = ckpt['state_dict'] - model.load_state_dict(state_dict) - - elif nemo_file is not None: - # Load from .nemo file - logging.info(f"Loading model from NeMo archive: {nemo_file}") - model_cfg = EasyMagpieTTSModel.restore_from(nemo_file, return_config=True) - - with open_dict(model_cfg): - model_cfg.codecmodel_path = codecmodel_path - model_cfg.train_ds = None - model_cfg.validation_ds = None - - model = EasyMagpieTTSModel.restore_from(nemo_file, override_config_path=model_cfg) - - else: - raise ValueError("Must provide either (hparams_file + checkpoint_file) or nemo_file") - - model.to(device) - model.eval() - logging.info("Model loaded and ready for streaming inference.") - - return model - - -def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: - """ - Load audio file and resample if needed. - - Args: - audio_path: Path to audio file. - target_sample_rate: Target sample rate. - - Returns: - Audio tensor of shape (1, num_samples). 
- """ - audio, sr = sf.read(audio_path, dtype='float32') - - # Convert to mono if stereo - if len(audio.shape) > 1: - audio = audio.mean(axis=1) - - # Resample if needed - if sr != target_sample_rate: - import librosa - - audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) - - return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) - - -def adjust_audio_to_duration( - audio: torch.Tensor, - sample_rate: int, - target_duration: float, - codec_model_samples_per_frame: int, -) -> torch.Tensor: - """ - Adjust audio to target_duration seconds, aligned to codec frame boundaries. - - The target number of samples is calculated to align with codec frame boundaries: - 1. Convert target_duration to number of codec frames - 2. Convert codec frames back to samples - - If audio is longer than target, take the first target_duration seconds. - If audio is shorter, repeat it until it reaches target_duration seconds. - - Args: - audio: Audio tensor of shape (1, num_samples). - sample_rate: Sample rate of the audio. - target_duration: Target duration in seconds. - codec_model_samples_per_frame: Number of audio samples per codec frame - (codec downsampling factor). - - Returns: - Audio tensor of shape (1, target_num_samples) where target_num_samples - is aligned to codec frame boundaries. 
- """ - # Calculate target samples aligned to codec frame boundaries - # Same logic as text_to_speech_dataset.py - num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) - target_num_samples = num_codec_frames * codec_model_samples_per_frame - current_num_samples = audio.size(1) - - if current_num_samples >= target_num_samples: - # Audio is longer than target - take the first target_duration seconds - audio = audio[:, :target_num_samples] - else: - # Audio is shorter - repeat until we have enough samples - num_repeats = int(np.ceil(target_num_samples / current_num_samples)) - audio_repeated = audio.repeat(1, num_repeats) - audio = audio_repeated[:, :target_num_samples] - - return audio - - -def run_streaming_inference( - model: EasyMagpieTTSModel, - context_audio: torch.Tensor, - context_audio_lens: torch.Tensor, - context_text: str, - text: str, - phoneme_text: Optional[str] = None, - use_gt_phonemes: bool = False, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.5, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - max_steps: int = 500, - verbose: bool = True, - force_dropout_text: bool = False, -) -> tuple: - """ - Run streaming TTS inference. - - Args: - model: The loaded EasyMagpieTTSModel. - context_audio: Context audio tensor (1, num_samples). - context_audio_lens: Length of context audio (1,). - context_text: Context text for speaker conditioning. - text: Main text to synthesize. - phoneme_text: Optional phoneme text for GT conditioning. If None, uses text. - use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). - inference_mode: Inference mode name (e.g., "streaming_4_8"). - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - max_steps: Maximum generation steps. 
- verbose: Whether to print progress. - - Returns: - Tuple of (output, timing_info, context_audio_decoded, context_audio_decoded_lens). - output is StreamingFinalizeOutput with audio, codes, and phoneme predictions. - context_audio_decoded is the decoded context audio from the model's internal codes (for sanity checking). - """ - device = next(model.parameters()).device - - # Encode context audio to codes - context_audio = context_audio.to(device) - context_audio_lens = context_audio_lens.to(device) - - with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = model.audio_to_codes(context_audio, context_audio_lens) - - # Tokenize context text - # Use the text conditioning tokenizer - tokenizer_name = model.text_conditioning_tokenizer_name - context_text_tokens = model.tokenizer.encode(context_text, tokenizer_name=tokenizer_name) - context_text_tokens = torch.tensor([context_text_tokens], dtype=torch.long, device=device) - context_text_tokens_lens = torch.tensor([context_text_tokens.size(1)], dtype=torch.long, device=device) - - # Tokenize main text - # Get the appropriate tokenizer name for main text - if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: - main_tokenizer_name = 'english_phoneme' - else: - main_tokenizer_name = tokenizer_name - - text_tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) - text_tokens = text_tokens + [model.eos_id] - text_tokens = torch.tensor(text_tokens, dtype=torch.long, device=device) - - # Tokenize phoneme text if provided (for GT phoneme conditioning) - gt_phoneme_tokens = None - gt_phoneme_tokens_lens = None - if model.phoneme_tokenizer is not None: - phoneme_source = phoneme_text if phoneme_text is not None else text - phoneme_tokens_list = model.phoneme_tokenizer.encode(phoneme_source) - # Add BOS and EOS - bos_id = model.phoneme_tokenizer.bos_token_id - eos_id = model.phoneme_tokenizer.eos_token_id - phoneme_tokens_list = [bos_id] + 
phoneme_tokens_list + [eos_id] - gt_phoneme_tokens = torch.tensor([phoneme_tokens_list], dtype=torch.long, device=device) - gt_phoneme_tokens_lens = torch.tensor([len(phoneme_tokens_list)], dtype=torch.long, device=device) - - phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' - - # Get streaming delays for logging - mode_name = inference_mode or model.default_inference_mode - training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) - phoneme_delay = training_mode.streaming_phonemes_delay - speech_delay = training_mode.streaming_speech_delay - - if verbose: - logging.info(f"Context audio codes shape: {context_audio_codes.shape}") - logging.info(f"Context text tokens: {context_text_tokens.shape}") - logging.info(f"Main text tokens: {text_tokens.shape} ({len(text_tokens)} tokens)") - if gt_phoneme_tokens is not None: - logging.info(f"GT phoneme tokens: {gt_phoneme_tokens.shape} ({gt_phoneme_tokens_lens[0].item()} tokens)") - logging.info(f"Phoneme input type: {phoneme_input_type}") - logging.info(f"Using inference mode: {mode_name}") - logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") - logging.info("Phases: Prompt (0 to phoneme_delay) -> Phoneme-only (phoneme_delay to speech_delay) -> Audio") - - # Initialize streaming state - start_time = time.time() - - state = model.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - inference_mode=inference_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - ) - - init_time = time.time() - start_time - if verbose: - logging.info(f"Streaming init completed in {init_time:.3f}s") - - # Decode and return context 
audio for sanity check - # The context_audio_codes in state have special tokens and are stacked - # We need to remove special tokens and decode them - with torch.inference_mode(): - ctx_codes = state.context_audio_codes.clone() - ctx_codes_lens = state.context_audio_codes_lens.clone() - # Remove special tokens (BOS and EOS) - ctx_codes, ctx_codes_lens = model.remove_special_tokens( - codes=ctx_codes, - codes_len=ctx_codes_lens, - ) - # codes_to_audio will handle unstacking internally - context_audio_decoded, context_audio_decoded_lens, _ = model.codes_to_audio(ctx_codes, ctx_codes_lens) - - # Feed text tokens one at a time - generation_start = time.time() - num_audio_frames = 0 - num_phoneme_frames = 0 - prompt_phase_tokens = 0 - phoneme_only_phase_tokens = 0 - - for i, token in enumerate(text_tokens): - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=token.unsqueeze(0), force_dropout_text=force_dropout_text - ) - - # Track which phase we're in - if audio_codes is None and phoneme_tokens is None: - prompt_phase_tokens += 1 - elif audio_codes is None and phoneme_tokens is not None: - phoneme_only_phase_tokens += 1 - num_phoneme_frames += 1 - else: - if audio_codes is not None: - num_audio_frames += 1 - if phoneme_tokens is not None: - num_phoneme_frames += 1 - - if verbose and (i + 1) % 10 == 0: - phase = ( - "prompt" - if audio_codes is None and phoneme_tokens is None - else ("phoneme-only" if audio_codes is None else "audio") - ) - logging.info( - f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " - f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" - ) - - if state.finished: - if verbose: - logging.info(f"EOS detected at text token {i + 1}") - break - - # Continue generating until finished (text has ended) - continuation_steps = 0 - while not state.finished and continuation_steps < max_steps: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=None, 
force_dropout_text=force_dropout_text - ) - - if audio_codes is not None: - num_audio_frames += 1 - if phoneme_tokens is not None: - num_phoneme_frames += 1 - - continuation_steps += 1 - - if verbose and continuation_steps % 20 == 0: - logging.info( - f"Continuation step {continuation_steps}, " - f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" - ) - - generation_time = time.time() - generation_start - - if verbose: - logging.info(f"Generation completed in {generation_time:.3f}s") - logging.info(f"Prompt phase tokens: {prompt_phase_tokens}") - logging.info(f"Phoneme-only phase tokens: {phoneme_only_phase_tokens}") - logging.info(f"Audio frames generated: {num_audio_frames}") - logging.info(f"Phoneme frames generated: {num_phoneme_frames}") - logging.info(f"Continuation steps: {continuation_steps}") - - # Finalize and get complete audio - output = model.streaming_finalize(state) - - total_time = time.time() - start_time - - if verbose and output.phoneme_text: - logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") - - timing_info = { - 'init_time': init_time, - 'generation_time': generation_time, - 'total_time': total_time, - 'num_text_tokens': len(text_tokens), - 'prompt_phase_tokens': prompt_phase_tokens, - 'phoneme_only_phase_tokens': phoneme_only_phase_tokens, - 'num_audio_frames': num_audio_frames, - 'num_phoneme_frames': num_phoneme_frames, - 'continuation_steps': continuation_steps, - } - - return output, timing_info, context_audio_decoded, context_audio_decoded_lens - - -def run_batched_streaming_inference( - model: EasyMagpieTTSModel, - context_audios: list[torch.Tensor], - context_audio_lens_list: list[torch.Tensor], - context_texts: list[str], - texts: list[str], - phoneme_texts: Optional[list[str]] = None, - use_gt_phonemes: bool = False, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.5, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - 
max_steps: int = 500, - verbose: bool = True, - force_dropout_text: bool = False, -) -> tuple: - """ - Run batched streaming TTS inference. - - Each batch item can have different context lengths. The streaming processes - only the minimum context length initially, then continues processing remaining - context per-item in the "context phase" before moving to prompt/audio phases. - - Args: - model: The loaded EasyMagpieTTSModel. - context_audios: List of context audio tensors, each (1, num_samples). - context_audio_lens_list: List of context audio lengths, each (1,). - context_texts: List of context texts for speaker conditioning. - texts: List of main texts to synthesize. - phoneme_texts: Optional list of phoneme texts for GT conditioning. If None, uses texts. - use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). - inference_mode: Inference mode name (e.g., "streaming_4_8"). - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - max_steps: Maximum generation steps. - verbose: Whether to print progress. - - Returns: - Tuple of (output, timing_info) where output is StreamingFinalizeOutput. 
- """ - device = next(model.parameters()).device - batch_size = len(context_audios) - - assert len(context_texts) == batch_size, "Number of context texts must match batch size" - assert len(texts) == batch_size, "Number of texts must match batch size" - - # Encode context audio to codes for each item - context_audio_codes_list = [] - context_audio_codes_lens_list = [] - - with torch.inference_mode(): - for i in range(batch_size): - context_audio = context_audios[i].to(device) - context_audio_lens = context_audio_lens_list[i].to(device) - codes, codes_lens = model.audio_to_codes(context_audio, context_audio_lens) - context_audio_codes_list.append(codes) - context_audio_codes_lens_list.append(codes_lens) - - # Pad and batch context audio codes - max_context_len = max(c.size(-1) for c in context_audio_codes_list) - num_codebooks = context_audio_codes_list[0].size(1) - - context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_len, dtype=torch.long, device=device) - context_audio_codes_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i in range(batch_size): - codes = context_audio_codes_list[i] - codes_len = context_audio_codes_lens_list[i] - context_audio_codes[i, :, : codes.size(-1)] = codes[0] - context_audio_codes_lens[i] = codes_len[0] - - # Tokenize context texts - tokenizer_name = model.text_conditioning_tokenizer_name - context_text_tokens_list = [] - for ctx_text in context_texts: - tokens = model.tokenizer.encode(ctx_text, tokenizer_name=tokenizer_name) - context_text_tokens_list.append(tokens) - - # Pad and batch context text tokens - max_context_text_len = max(len(t) for t in context_text_tokens_list) - context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) - context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i, tokens in enumerate(context_text_tokens_list): - context_text_tokens[i, : len(tokens)] = torch.tensor(tokens, 
dtype=torch.long, device=device) - context_text_tokens_lens[i] = len(tokens) - - # Tokenize main texts - if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: - main_tokenizer_name = 'english_phoneme' - else: - main_tokenizer_name = tokenizer_name - - text_tokens_list = [] - for text in texts: - tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) - tokens = tokens + [model.eos_id] - text_tokens_list.append(torch.tensor(tokens, dtype=torch.long, device=device)) - - max_text_len = max(len(t) for t in text_tokens_list) - - # Tokenize phoneme texts if model has phoneme tokenizer - gt_phoneme_tokens = None - gt_phoneme_tokens_lens = None - if model.phoneme_tokenizer is not None: - phoneme_sources = phoneme_texts if phoneme_texts is not None else texts - bos_id = model.phoneme_tokenizer.bos_token_id - eos_id = model.phoneme_tokenizer.eos_token_id - phoneme_tokens_lists = [] - for ptext in phoneme_sources: - tokens = model.phoneme_tokenizer.encode(ptext) - tokens = [bos_id] + tokens + [eos_id] - phoneme_tokens_lists.append(tokens) - max_phoneme_len = max(len(t) for t in phoneme_tokens_lists) - gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) - gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - for i, tokens in enumerate(phoneme_tokens_lists): - gt_phoneme_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) - gt_phoneme_tokens_lens[i] = len(tokens) - - phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' - - # Get streaming delays for logging - mode_name = inference_mode or model.default_inference_mode - training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) - phoneme_delay = training_mode.streaming_phonemes_delay - speech_delay = training_mode.streaming_speech_delay - - if verbose: - logging.info(f"Batch size: {batch_size}") - logging.info(f"Context audio codes shape: 
{context_audio_codes.shape}") - logging.info(f"Context audio codes lens: {context_audio_codes_lens.tolist()}") - logging.info(f"Context text tokens shape: {context_text_tokens.shape}") - logging.info(f"Context text tokens lens: {context_text_tokens_lens.tolist()}") - logging.info(f"Max text tokens: {max_text_len}") - logging.info(f"Text tokens per item: {[len(t) for t in text_tokens_list]}") - if gt_phoneme_tokens is not None: - logging.info(f"GT phoneme tokens shape: {gt_phoneme_tokens.shape}") - logging.info(f"GT phoneme tokens lens: {gt_phoneme_tokens_lens.tolist()}") - logging.info(f"Phoneme input type: {phoneme_input_type}") - logging.info(f"Using inference mode: {mode_name}") - logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") - - # Initialize streaming state - start_time = time.time() - - state = model.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - inference_mode=inference_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - ) - - init_time = time.time() - start_time - if verbose: - logging.info(f"Streaming init completed in {init_time:.3f}s") - logging.info(f"Initial context_position: {state.context_position.tolist()}") - logging.info(f"Full context lens: {state.full_context_lens.tolist()}") - - # Feed text tokens one at a time - generation_start = time.time() - step_count = 0 - num_audio_frames = 0 - - # Track which items have finished their text - text_positions = torch.zeros(batch_size, dtype=torch.long, device=device) - text_finished_mask = torch.zeros(batch_size, dtype=torch.bool, device=device) - - # Main streaming loop - while not state.finished.all() 
and step_count < max_steps + max_text_len: - # Determine which items are in context phase - in_context_phase = state.context_position < state.full_context_lens - - # Prepare text tokens for this step - # Items in context phase: use 0 (will be ignored) - # Items not in context phase: use their next text token or 0 if text finished - text_tokens_batch = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i in range(batch_size): - if not in_context_phase[i] and not text_finished_mask[i]: - if text_positions[i] < len(text_tokens_list[i]): - text_tokens_batch[i] = text_tokens_list[i][text_positions[i]] - text_positions[i] += 1 - else: - text_finished_mask[i] = True - - # Determine if we should pass None (all items have finished text and exited context) - all_text_done = text_finished_mask.all() and not in_context_phase.any() - - if all_text_done: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=None, force_dropout_text=force_dropout_text - ) - else: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text - ) - - if audio_codes is not None: - num_audio_frames += 1 - - step_count += 1 - - if verbose and step_count % 20 == 0: - in_ctx = state.context_position < state.full_context_lens - logging.info( - f"Step {step_count}: " - f"in_context_phase={in_ctx.tolist()}, " - f"text_positions={text_positions.tolist()}, " - f"audio_frames={num_audio_frames}, " - f"finished={state.finished.tolist()}" - ) - - generation_time = time.time() - generation_start - - if verbose: - logging.info(f"Generation completed in {generation_time:.3f}s") - logging.info(f"Total steps: {step_count}") - logging.info(f"Audio frames generated: {num_audio_frames}") - - # Finalize and get complete audio - output = model.streaming_finalize(state) - - total_time = time.time() - start_time - - if verbose and output.phoneme_text: - for i, ptext in enumerate(output.phoneme_text): - 
logging.info(f"Predicted phoneme text [{i}]: {ptext}") - - timing_info = { - 'init_time': init_time, - 'generation_time': generation_time, - 'total_time': total_time, - 'num_text_tokens': [len(t) for t in text_tokens_list], - 'num_audio_frames': num_audio_frames, - 'total_steps': step_count, - } - - return output, timing_info - - -def main(): - parser = argparse.ArgumentParser( - description="MagpieTTS Streaming Inference Test Script", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - # Model loading arguments - model_group = parser.add_argument_group('Model Loading') - model_group.add_argument( - '--hparams_file', - type=str, - default=None, - help='Path to hparams.yaml file', - ) - model_group.add_argument( - '--checkpoint_file', - type=str, - default=None, - help='Path to .ckpt checkpoint file', - ) - model_group.add_argument( - '--nemo_file', - type=str, - default=None, - help='Path to .nemo model file', - ) - model_group.add_argument( - '--codecmodel_path', - type=str, - required=True, - help='Path to audio codec model (.nemo)', - ) - - # Input arguments - input_group = parser.add_argument_group('Input') - input_group.add_argument( - '--context_audio', - type=str, - nargs='+', - required=True, - help='Path(s) to context audio file(s) for speaker cloning. ' 'Multiple files enable batched inference.', - ) - input_group.add_argument( - '--context_text', - type=str, - nargs='+', - default=["[NO TEXT CONTEXT]"], - help='Context text(s) for speaker conditioning. Provide one per context audio, ' - 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', - ) - input_group.add_argument( - '--context_duration', - type=float, - nargs='+', - default=[5.0], - help='Target duration(s) for context audio in seconds. Provide one per context audio, ' - 'or a single value to use for all. If audio is longer, ' - 'first N seconds are used. If shorter, audio is repeated. 
(default: 5.0)', - ) - input_group.add_argument( - '--text', - type=str, - nargs='+', - required=True, - help='Text(s) to synthesize. Provide one per context audio for batched inference.', - ) - input_group.add_argument( - '--phoneme_text', - type=str, - nargs='+', - default=None, - help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. ' - 'Provide one per context audio for batched inference.', - ) - input_group.add_argument( - '--use_gt_phonemes', - action='store_true', - help='Use ground-truth phonemes as decoder input (teacher forcing). ' - 'If not set, uses model-predicted phonemes.', - ) - - # Output arguments - output_group = parser.add_argument_group('Output') - output_group.add_argument( - '--output_path', - type=str, - default='streaming_output.wav', - help='Path for output audio file', - ) - - # Inference arguments - infer_group = parser.add_argument_group('Inference Parameters') - infer_group.add_argument( - '--inference_mode', - type=str, - default=None, - help='Inference mode name (e.g., "streaming_4_8"). 
Uses model default if not specified.', - ) - infer_group.add_argument( - '--use_cfg', - action='store_true', - help='Enable classifier-free guidance', - ) - infer_group.add_argument( - '--cfg_scale', - type=float, - default=1.5, - help='CFG scale factor (higher = stronger conditioning)', - ) - infer_group.add_argument( - '--use_local_transformer', - action='store_true', - help='Use local transformer for inference', - ) - infer_group.add_argument( - '--temperature', - type=float, - default=0.7, - help='Sampling temperature', - ) - infer_group.add_argument( - '--topk', - type=int, - default=80, - help='Top-k sampling parameter', - ) - infer_group.add_argument( - '--max_steps', - type=int, - default=500, - help='Maximum generation steps after text ends', - ) - infer_group.add_argument( - '--device', - type=str, - default='cuda', - choices=['cuda', 'cpu'], - help='Device to run inference on', - ) - infer_group.add_argument( - '--verbose', - action='store_true', - help='Print detailed progress information', - ) - infer_group.add_argument( - '--force_dropout_text', - action='store_true', - help='Force dropout of text embeddings (pass zeros) to test phoneme-only inference', - ) - - args = parser.parse_args() - - # Validate arguments - has_ckpt_mode = args.hparams_file is not None and args.checkpoint_file is not None - has_nemo_mode = args.nemo_file is not None - - if not (has_ckpt_mode or has_nemo_mode): - parser.error("Must provide either (--hparams_file and --checkpoint_file) or --nemo_file") - - # Load model - model = load_model( - hparams_file=args.hparams_file, - checkpoint_file=args.checkpoint_file, - nemo_file=args.nemo_file, - codecmodel_path=args.codecmodel_path, - device=args.device, - ) - - model = model.float() - - # Determine batch size from number of context audios - batch_size = len(args.context_audio) - - # Expand context_text, context_duration, and text to match batch_size - context_texts = args.context_text - if len(context_texts) == 1 and batch_size > 
1: - context_texts = context_texts * batch_size - elif len(context_texts) != batch_size: - parser.error( - f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1" - ) - - context_durations = args.context_duration - if len(context_durations) == 1 and batch_size > 1: - context_durations = context_durations * batch_size - elif len(context_durations) != batch_size: - parser.error( - f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1" - ) - - texts = args.text - if len(texts) == 1 and batch_size > 1: - texts = texts * batch_size - elif len(texts) != batch_size: - parser.error(f"Number of texts ({len(texts)}) must match number of context_audios ({batch_size}) or be 1") - - # Handle phoneme_text - default to text if not provided - phoneme_texts = args.phoneme_text - if phoneme_texts is None: - phoneme_texts = texts - elif len(phoneme_texts) == 1 and batch_size > 1: - phoneme_texts = phoneme_texts * batch_size - elif len(phoneme_texts) != batch_size: - parser.error( - f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1" - ) - - # Load and process context audios - context_audios = [] - context_audio_lens_list = [] - - for i, (audio_path, duration) in enumerate(zip(args.context_audio, context_durations)): - logging.info(f"Loading context audio {i+1}/{batch_size} from: {audio_path}") - audio = load_audio(audio_path, model.sample_rate) - original_duration = audio.size(1) / model.sample_rate - logging.info(f" Original duration: {original_duration:.2f}s") - - # Adjust to target duration (aligned to codec frame boundaries) - audio = adjust_audio_to_duration(audio, model.sample_rate, duration, model.codec_model_samples_per_frame) - adjusted_duration = audio.size(1) / model.sample_rate - logging.info(f" Adjusted duration: {adjusted_duration:.2f}s (target: {duration}s, codec-aligned)") - - 
context_audios.append(audio) - context_audio_lens_list.append(torch.tensor([audio.size(1)], dtype=torch.long)) - - logging.info(f"\nBatch size: {batch_size}") - logging.info(f"Context texts: {context_texts}") - logging.info(f"Texts to synthesize: {texts}") - logging.info(f"Phoneme texts: {phoneme_texts}") - logging.info(f"Use GT phonemes: {args.use_gt_phonemes}") - - # Use single-sample or batched inference - if batch_size == 1: - logging.info("\n=== Running single-sample streaming inference ===") - output, timing_info, context_audio_decoded, context_audio_decoded_lens = run_streaming_inference( - model=model, - context_audio=context_audios[0], - context_audio_lens=context_audio_lens_list[0], - context_text=context_texts[0], - text=texts[0], - phoneme_text=phoneme_texts[0], - use_gt_phonemes=args.use_gt_phonemes, - inference_mode=args.inference_mode, - use_cfg=args.use_cfg, - cfg_scale=args.cfg_scale, - use_local_transformer=args.use_local_transformer, - temperature=args.temperature, - topk=args.topk, - max_steps=args.max_steps, - verbose=args.verbose, - force_dropout_text=args.force_dropout_text, - ) - - # Save output - output_dir = os.path.dirname(args.output_path) - if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - audio_np = output.audio[0, : output.audio_len[0].item()].cpu().numpy() - sf.write(args.output_path, audio_np, model.output_sample_rate) - logging.info(f"Output saved to: {args.output_path}") - - # Save decoded context audio for sanity check - output_base, output_ext = os.path.splitext(args.output_path) - context_output_path = f"{output_base}_context_decoded{output_ext}" - context_audio_np = context_audio_decoded[0, : context_audio_decoded_lens[0].item()].cpu().numpy() - sf.write(context_output_path, context_audio_np, model.output_sample_rate) - - logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") - logging.info(f"Context audio duration: {context_audio_decoded_lens[0].item() / 
model.output_sample_rate:.2f}s") - logging.info(f"Audio duration: {output.audio_len[0].item() / model.output_sample_rate:.2f}s") - logging.info(f"Generated codes shape: {output.audio_codes.shape}") - if output.phoneme_text: - logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") - - # Print timing summary - logging.info("\n=== Timing Summary ===") - logging.info(f"Init time: {timing_info['init_time']:.3f}s") - logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") - logging.info(f"Total time: {timing_info['total_time']:.3f}s") - logging.info(f"Text tokens processed: {timing_info['num_text_tokens']}") - logging.info(f" - Prompt phase tokens: {timing_info['prompt_phase_tokens']}") - logging.info(f" - Phoneme-only phase tokens: {timing_info['phoneme_only_phase_tokens']}") - logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") - logging.info(f"Phoneme frames generated: {timing_info['num_phoneme_frames']}") - logging.info(f"Continuation steps: {timing_info['continuation_steps']}") - - # Calculate RTF - audio_duration = output.audio_len[0].item() / model.output_sample_rate - rtf = audio_duration / timing_info['total_time'] - logging.info(f"Real-time factor (RTF): {rtf:.2f}x") - - else: - logging.info(f"\n=== Running batched streaming inference (batch_size={batch_size}) ===") - output, timing_info = run_batched_streaming_inference( - model=model, - context_audios=context_audios, - context_audio_lens_list=context_audio_lens_list, - context_texts=context_texts, - texts=texts, - phoneme_texts=phoneme_texts, - use_gt_phonemes=args.use_gt_phonemes, - inference_mode=args.inference_mode, - use_cfg=args.use_cfg, - cfg_scale=args.cfg_scale, - use_local_transformer=args.use_local_transformer, - temperature=args.temperature, - topk=args.topk, - max_steps=args.max_steps, - verbose=args.verbose, - force_dropout_text=args.force_dropout_text, - ) - - # Save outputs for each batch item - output_dir = os.path.dirname(args.output_path) - 
if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - output_base, output_ext = os.path.splitext(args.output_path) - - for i in range(batch_size): - output_path_i = f"{output_base}_{i}{output_ext}" - audio_np = output.audio[i, : output.audio_len[i].item()].cpu().numpy() - sf.write(output_path_i, audio_np, model.output_sample_rate) - audio_duration_i = output.audio_len[i].item() / model.output_sample_rate - logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") - if output.phoneme_text and i < len(output.phoneme_text): - logging.info(f" Predicted phoneme text: {output.phoneme_text[i]}") - - logging.info(f"\nGenerated codes shape: {output.audio_codes.shape}") - - # Print timing summary - logging.info("\n=== Timing Summary ===") - logging.info(f"Init time: {timing_info['init_time']:.3f}s") - logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") - logging.info(f"Total time: {timing_info['total_time']:.3f}s") - logging.info(f"Text tokens per item: {timing_info['num_text_tokens']}") - logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") - logging.info(f"Total steps: {timing_info['total_steps']}") - - # Calculate average RTF - total_audio_duration = sum(output.audio_len[i].item() for i in range(batch_size)) / model.output_sample_rate - avg_rtf = total_audio_duration / timing_info['total_time'] - logging.info(f"Average real-time factor (RTF): {avg_rtf:.2f}x") - logging.info(f"Total audio duration (all items): {total_audio_duration:.2f}s") - - -if __name__ == "__main__": - main() From cfa582f81fca415dc23514bf095ddcd76e7e85f1 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Mar 2026 10:30:34 -0700 Subject: [PATCH 87/94] Magpietts decoderonly 2601 inference refactoring (#69) * undo model pt Signed-off-by: Shehzeen Hussain * remove test infer vs proces batch Signed-off-by: Shehzeen Hussain * undo inference changes for easy magpie to start fresh 
Signed-off-by: Shehzeen Hussain * inference refactoring Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- docs/source/tts/magpietts-longform.rst | 6 +- docs/source/tts/magpietts.rst | 2 +- examples/tts/evalset_config.json | 1 + .../{magpietts_inference.py => tts_infer.py} | 421 ++++++----- nemo/collections/tts/models/easy_magpietts.py | 25 +- .../modules/magpietts_inference/__init__.py | 54 +- .../modules/magpietts_inference/inference.py | 682 +++++++++--------- .../tts/modules/magpietts_inference/utils.py | 113 ++- nemo/core/classes/modelPT.py | 2 +- .../tts/test_infer_vs_process_batch.py | 491 ------------- ...S_InferEvaluate_Magpietts_FrameStacking.sh | 2 +- ...TS_InferEvaluate_Magpietts_MoE_ZeroShot.sh | 2 +- ...TS_InferEvaluate_Magpietts_SeenSpeakers.sh | 2 +- ...L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh | 2 +- ...Evaluatelongform_Magpietts_MoE_ZeroShot.sh | 2 +- ...nferEvaluatelongform_Magpietts_ZeroShot.sh | 2 +- 16 files changed, 736 insertions(+), 1073 deletions(-) rename examples/tts/{magpietts_inference.py => tts_infer.py} (68%) delete mode 100644 tests/collections/tts/test_infer_vs_process_batch.py diff --git a/docs/source/tts/magpietts-longform.rst b/docs/source/tts/magpietts-longform.rst index 33aef42a5abe..fb3eeb659d33 100644 --- a/docs/source/tts/magpietts-longform.rst +++ b/docs/source/tts/magpietts-longform.rst @@ -169,7 +169,7 @@ The ``do_tts`` method automatically detects whether longform inference is needed sf.write("output.wav", long_audio[0].cpu().numpy(), 22050) -Method 2: Using CLI (``magpietts_inference.py``) +Method 2: Using CLI (``tts_infer.py``) ------------------------------------------------ For batch inference from manifests: @@ -177,7 +177,7 @@ For batch inference from manifests: .. 
code-block:: bash # Auto-detect longform based on text length (default) - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ @@ -185,7 +185,7 @@ For batch inference from manifests: --longform_mode auto # Force longform inference for all inputs - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ diff --git a/docs/source/tts/magpietts.rst b/docs/source/tts/magpietts.rst index b79c11ea88ff..6d297a694596 100644 --- a/docs/source/tts/magpietts.rst +++ b/docs/source/tts/magpietts.rst @@ -130,7 +130,7 @@ Several parameters control the generation behavior. The temperature setting affe .. code-block:: bash - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts_model.nemo \ --codecmodel_path /path/to/audio_codec.nemo \ --datasets your_evaluation_set \ diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 4be3056020ce..2d61a601f880 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -15,3 +15,4 @@ "feature_dir": null } } + diff --git a/examples/tts/magpietts_inference.py b/examples/tts/tts_infer.py similarity index 68% rename from examples/tts/magpietts_inference.py rename to examples/tts/tts_infer.py index f1ed60c27428..2c3bec0aa7f7 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/tts_infer.py @@ -12,25 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -MagpieTTS Inference and Evaluation Script. +TTS Inference and Evaluation Script. 
-Supports both standard and Mixture of Experts (MoE) models with: +Supports both encoder-decoder MagpieTTS and decoder-only EasyMagpieTTS models +with: - Automatic MoE detection and FLOPs calculation - Comprehensive evaluation metrics (RTF, FLOPs, CER, SSIM, etc.) -This script provides a clean CLI for running MagpieTTS inference with optional evaluation. -It decouples inference and evaluation into separate modules for better maintainability. +This script provides a clean CLI for running TTS inference with optional +evaluation. Model-specific behaviour (dataset creation, inference loop, CLI +arguments) is handled by separate runner classes so there is no scattered +if/else branching. Example usage: - # Inference only (from .nemo file) - default behavior - python examples/tts/magpietts_inference.py \\ + # MagpieTTS inference (encoder-decoder, default) + python examples/tts/tts_infer.py \\ + --model_type magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ --out_dir /path/to/output \\ --codecmodel_path /path/to/codec.nemo - # Inference with evaluation (from checkpoint) - python examples/tts/magpietts_inference.py \\ + # EasyMagpieTTS inference (decoder-only) + python examples/tts/tts_infer.py \\ + --model_type easy_magpie \\ + --nemo_files /path/to/model.nemo \\ + --datasets_json_path /path/to/evalset_config.json \\ + --out_dir /path/to/output \\ + --codecmodel_path /path/to/codec.nemo + + # With evaluation + python examples/tts/tts_infer.py \\ + --model_type magpie \\ --hparams_files /path/to/hparams.yaml \\ --checkpoint_files /path/to/model.ckpt \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -53,20 +66,27 @@ import numpy as np from nemo.collections.asr.parts.utils.manifest_utils import read_manifest +from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from 
nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config - -# Import the modular components from nemo.collections.tts.modules.magpietts_inference.evaluation import ( DEFAULT_VIOLIN_METRICS, EvaluationConfig, compute_mean_with_confidence_interval, evaluate_generated_audio_dir, ) -from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner +from nemo.collections.tts.modules.magpietts_inference.inference import ( + BaseInferenceConfig, + BaseInferenceRunner, + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + MagpieInferenceConfig, + MagpieInferenceRunner, +) from nemo.collections.tts.modules.magpietts_inference.utils import ( ModelLoadConfig, get_experiment_name_from_checkpoint_path, + load_easy_magpie_model, load_magpie_model, log_model_architecture_summary, ) @@ -132,50 +152,54 @@ def create_formatted_metrics_mean_ci(metrics_mean_ci: dict) -> dict: def filter_datasets(dataset_meta_info: dict, datasets: Optional[List[str]]) -> List[str]: """Select datasets from the dataset meta info.""" if datasets is None: - # Dataset filtering not specified, return all datasets return list(dataset_meta_info.keys()) else: datasets = datasets.split(",") - # Check if datasets are valid for dataset in datasets: if dataset not in dataset_meta_info: raise ValueError(f"Dataset {dataset} not found in dataset meta info") - # Return all requsted datasets return datasets +# --------------------------------------------------------------------------- +# Core inference + evaluation orchestration (model-type agnostic) +# --------------------------------------------------------------------------- + + def run_inference_and_evaluation( - model_config: ModelLoadConfig, - inference_config: InferenceConfig, + runner: BaseInferenceRunner, + checkpoint_name: str, + inference_config: BaseInferenceConfig, eval_config: EvaluationConfig, dataset_meta_info: dict, - datasets: Optional[List[str]], + datasets: 
List[str], out_dir: str, + flops_per_component: dict, + moe_info: str, num_repeats: int = 1, confidence_level: float = 0.95, violin_plot_metrics: Optional[List[str]] = None, - log_exp_name: bool = False, clean_up_disk: bool = False, skip_evaluation: bool = False, ) -> Tuple[Optional[float], Optional[float]]: """Run inference and optional evaluation on specified datasets. - Uses unified inference path with automatic text chunking based on - per-sample language thresholds. Short texts are processed as single chunks, - long texts are automatically split into sentences. + This function is model-type agnostic -- it delegates dataset creation + and batch inference to the provided ``runner``. Args: - model_config: Configuration for loading the model. + runner: Concrete inference runner (MagpieInferenceRunner or EasyMagpieInferenceRunner). + checkpoint_name: Human-readable checkpoint identifier for output naming. inference_config: Configuration for inference. eval_config: Configuration for evaluation. dataset_meta_info: Dictionary containing dataset metadata. - datasets: List of dataset names to run inference and evaluation on. If None, all datasets in the - dataset meta info will be processed. + datasets: List of dataset names to process. out_dir: Output directory for results. + flops_per_component: FLOPs info dict from log_model_architecture_summary. + moe_info: MoE identifier string from log_model_architecture_summary. num_repeats: Number of times to repeat inference (for CI estimation). confidence_level: Confidence level for CI calculation. violin_plot_metrics: Metrics to include in violin plots. - log_exp_name: Whether to include experiment name in output paths. clean_up_disk: Whether to clean up output directory after completion. skip_evaluation: Whether to skip evaluation (inference only mode). 
@@ -185,40 +209,17 @@ def run_inference_and_evaluation( if violin_plot_metrics is None: violin_plot_metrics = list(DEFAULT_VIOLIN_METRICS) - # Remove UTMOSv2 from plots if disabled if not eval_config.with_utmosv2 and 'utmosv2' in violin_plot_metrics: violin_plot_metrics.remove('utmosv2') - # Load model - model, checkpoint_name = load_magpie_model( - model_config, is_decoder_only_model=inference_config.is_decoder_only_model - ) - # change model to fp32 for inference - model = model.float() - - # Log architecture summary and get MoE info + FLOPs metrics - moe_info, flops_per_component = log_model_architecture_summary(model) - - # Add experiment name prefix if requested - if log_exp_name and model_config.checkpoint_file: - exp_name = get_experiment_name_from_checkpoint_path(model_config.checkpoint_file) - checkpoint_name = f"{exp_name}__{checkpoint_name}" - - # Build full checkpoint identifier (include MoE info if present) full_checkpoint_name = ( f"{checkpoint_name}_{moe_info}{inference_config.build_identifier()}_SV_{eval_config.sv_model}" ) - # Create inference runner (uses unified path with automatic text chunking) - logging.info("Using unified inference with automatic text chunking based on language thresholds") - runner = MagpieInferenceRunner(model, inference_config) - - # Tracking metrics across datasets ssim_per_dataset = [] cer_per_dataset = [] all_datasets_filewise_metrics = {} - # CSV headers csv_header = ( "checkpoint_name,dataset,cer_filewise_avg,wer_filewise_avg,cer_cumulative," "wer_cumulative,ssim_pred_gt_avg,ssim_pred_context_avg,ssim_gt_context_avg," @@ -234,17 +235,14 @@ def run_inference_and_evaluation( manifest_records = read_manifest(meta['manifest_path']) language = meta.get('whisper_language', 'en') - # Prepare dataset metadata (remove evaluation-specific keys) dataset_meta_for_dl = copy.deepcopy(meta) for key in ["whisper_language", "load_cached_codes_if_available"]: dataset_meta_for_dl.pop(key, None) - # Setup output directories eval_dir = 
os.path.join(out_dir, f"{full_checkpoint_name}_{dataset}") audio_dir = os.path.join(eval_dir, "audio") os.makedirs(eval_dir, exist_ok=True) - # Setup CSV files per_run_csv = os.path.join(eval_dir, "all_experiment_metrics.csv") write_csv_header_if_needed(per_run_csv, csv_header) @@ -257,7 +255,6 @@ def run_inference_and_evaluation( repeat_audio_dir = os.path.join(audio_dir, f"repeat_{repeat_idx}") os.makedirs(repeat_audio_dir, exist_ok=True) - # Create dataset and run inference test_dataset = runner.create_dataset({dataset: dataset_meta_for_dl}) if len(test_dataset) != len(manifest_records): @@ -271,14 +268,12 @@ def run_inference_and_evaluation( manifest_records=manifest_records, audio_base_dir=meta['audio_dir'], save_cross_attention_maps=True, - save_context_audio=(repeat_idx == 0), # Only save context audio once - save_predicted_codes=eval_config.with_fcd, # Code files are only needed for FCD computation + save_context_audio=(repeat_idx == 0), + save_predicted_codes=eval_config.with_fcd, ) - # Compute mean RTF metrics mean_rtf = runner.compute_mean_rtf_metrics(rtf_metrics_list) - # Add FLOPs metrics per component for component_name, component_flops in flops_per_component.items(): for key, value in component_flops.items(): mean_rtf[f"{component_name}_{key}"] = value @@ -291,7 +286,6 @@ def run_inference_and_evaluation( logging.info("Skipping evaluation as requested.") continue - # Run evaluation eval_config_for_dataset = EvaluationConfig( sv_model=eval_config.sv_model, asr_model_name=eval_config.asr_model_name, @@ -312,7 +306,6 @@ def run_inference_and_evaluation( metrics_all_repeats.append(metrics) filewise_metrics_all_repeats.extend(filewise_metrics) - # Save metrics with open(os.path.join(eval_dir, f"{dataset}_metrics_{repeat_idx}.json"), "w") as f: json.dump(metrics, f, indent=4) @@ -320,24 +313,19 @@ def run_inference_and_evaluation( with open(os.path.join(eval_dir, f"{dataset}_filewise_metrics_{repeat_idx}.json"), "w") as f: json.dump(sorted_filewise, f, 
indent=4) - # Append to per-run CSV append_metrics_to_csv(per_run_csv, full_checkpoint_name, dataset, metrics) - # Create violin plot for this repeat violin_path = Path(eval_dir) / f"{dataset}_violin_{repeat_idx}.png" create_violin_plot(filewise_metrics, violin_plot_metrics, violin_path) - # Delete temporary predicted codes files for codec_file_path in codec_file_paths: os.remove(codec_file_path) if skip_evaluation or not metrics_all_repeats: continue - # Store for combined plot all_datasets_filewise_metrics[dataset] = filewise_metrics_all_repeats - # Compute mean with confidence interval across repeats metrics_mean_ci = compute_mean_with_confidence_interval( metrics_all_repeats, confidence=confidence_level, @@ -345,42 +333,76 @@ def run_inference_and_evaluation( formatted_metrics_mean_ci = create_formatted_metrics_mean_ci(metrics_mean_ci) - # Write to aggregated CSV ci_csv = os.path.join(out_dir, "all_experiment_metrics_with_ci.csv") write_csv_header_if_needed(ci_csv, csv_header) append_metrics_to_csv(ci_csv, full_checkpoint_name, dataset, formatted_metrics_mean_ci) - # Track per-dataset means ssim_values = [m['ssim_pred_context_avg'] for m in metrics_all_repeats] cer_values = [m['cer_cumulative'] for m in metrics_all_repeats] ssim_per_dataset.append(np.mean(ssim_values)) cer_per_dataset.append(np.mean(cer_values)) - # Create combined plot if we have multiple datasets if len(all_datasets_filewise_metrics) > 1: combined_plot_path = os.path.join(out_dir, f"{full_checkpoint_name}_combined_violin_plot.png") create_combined_box_plot(all_datasets_filewise_metrics, violin_plot_metrics, combined_plot_path) - # Clean up if requested if clean_up_disk: logging.info(f"Cleaning up output directory: {out_dir}") shutil.rmtree(out_dir) - # Return averaged metrics if ssim_per_dataset and cer_per_dataset: return np.mean(cer_per_dataset), np.mean(ssim_per_dataset) return None, None -def create_argument_parser() -> argparse.ArgumentParser: - """Create the CLI argument parser.""" - 
parser = argparse.ArgumentParser( - description='MagpieTTS Inference and Evaluation', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, +# --------------------------------------------------------------------------- +# CLI argument parser +# --------------------------------------------------------------------------- + + +def _add_inference_param_fields( + group: argparse._ArgumentGroup, + param_cls: type, + skip_fields: Optional[set] = None, +) -> None: + """Auto-generate argparse arguments from fields of a dataclass. + + Args: + group: The argparse argument group to add arguments to. + param_cls: The dataclass whose fields to add. + skip_fields: Field names to skip (already added by another group). + """ + if skip_fields is None: + skip_fields = set() + for f in fields(param_cls): + if f.name in skip_fields: + continue + extra_args: dict = {"type": f.type} + if f.type == bool: + extra_args = {"action": "store_true"} + if f.name in ("estimate_alignment_from_layers", "apply_prior_to_layers"): + extra_args = { + "help": "Must be a comma separated string. 
Not enclosed in brackets", + "type": str, + } + elif f.name == "eos_detection_method": + extra_args["choices"] = [m.value for m in EOSDetectionMethod] + group.add_argument(f"--{f.name}", **extra_args) + + +def _add_common_args(parser: argparse.ArgumentParser) -> None: + """Add arguments shared by all model types.""" + + parser.add_argument( + '--model_type', + type=str, + default='magpie', + choices=['magpie', 'easy_magpie'], + help='Model type: "magpie" for encoder-decoder MagpieTTSModel, ' + '"easy_magpie" for decoder-only EasyMagpieTTSModel', ) - # Model loading arguments + # Model loading model_group = parser.add_argument_group('Model Loading') model_group.add_argument( '--hparams_files', @@ -422,73 +444,37 @@ def create_argument_parser() -> argparse.ArgumentParser: help='Use legacy text conditioning (for old checkpoints)', ) - # Dataset and output arguments + # Dataset and output data_group = parser.add_argument_group('Dataset and Output') data_group.add_argument( '--datasets_json_path', type=str, required=True, default=None, - help='Path to dataset configuration JSON file (will process all datasets in the file if --datasets is not specified)', + help='Path to dataset configuration JSON file', ) data_group.add_argument( '--datasets', type=str, default=None, - help='Comma-separated list of dataset names to process using names from the datasets_json_path file. 
If not specified, all datasets in the datasets_json_path will be processed.', - ) - data_group.add_argument( - '--out_dir', - type=str, - required=True, - help='Output directory for generated audio and metrics', - ) - data_group.add_argument( - '--log_exp_name', - action='store_true', - help='Include experiment name in output folder name', - ) - data_group.add_argument( - '--clean_up_disk', - action='store_true', - help='Delete output directory after completion', + help='Comma-separated list of dataset names to process', ) + data_group.add_argument('--out_dir', type=str, required=True, help='Output directory') + data_group.add_argument('--log_exp_name', action='store_true') + data_group.add_argument('--clean_up_disk', action='store_true') - # Inference arguments - infer_group = parser.add_argument_group('Inference Parameters') - # Add model specific parameters - for field in fields(ModelInferenceParameters): - extra_args = {"type": field.type} - if field.type == bool: - extra_args["action"] = "store_true" - del extra_args["type"] - if field.name == "estimate_alignment_from_layers" or field.name == "apply_prior_to_layers": - extra_args["help"] = "Must be a comma separate string. 
Not enclosed in brackets" - extra_args["type"] = str - elif field.name == "eos_detection_method": - extra_args["choices"] = [m.value for m in EOSDetectionMethod] - infer_group.add_argument(f"--{field.name}", **extra_args) + # Common inference parameters + infer_group = parser.add_argument_group('Common Inference Parameters') infer_group.add_argument('--batch_size', type=int, default=32) infer_group.add_argument('--use_cfg', action='store_true', help='Enable classifier-free guidance') - - # Local transformer / MaskGit arguments infer_group.add_argument('--use_local_transformer', action='store_true') - infer_group.add_argument('--maskgit_n_steps', type=int, default=3) - infer_group.add_argument('--maskgit_noise_scale', type=float, default=0.0) - infer_group.add_argument('--maskgit_fixed_schedule', type=int, nargs='+', default=None) - infer_group.add_argument( - '--maskgit_sampling_type', - default=None, - choices=["default", "causal", "purity_causal", "purity_default"], - ) - # Evaluation arguments + # Shared model inference parameters (max_decoder_steps, temperature, topk, cfg_scale) + _add_inference_param_fields(infer_group, EasyModelInferenceParameters) + + # Evaluation eval_group = parser.add_argument_group('Evaluation') - eval_group.add_argument( - '--run_evaluation', - action='store_true', - help='Run evaluation after inference (default: False, inference only)', - ) + eval_group.add_argument('--run_evaluation', action='store_true', help='Run evaluation after inference') eval_group.add_argument('--sv_model', type=str, default="titanet", choices=["titanet", "wavlm"]) eval_group.add_argument('--asr_model_name', type=str, default="nvidia/parakeet-tdt-1.1b") eval_group.add_argument('--num_repeats', type=int, default=1) @@ -500,70 +486,92 @@ def create_argument_parser() -> argparse.ArgumentParser: nargs='*', default=['cer', 'pred_context_ssim', 'utmosv2'], ) - eval_group.add_argument('--disable_fcd', action='store_true', help="Disable Frechet Codec Distance 
computation") + eval_group.add_argument('--disable_fcd', action='store_true') - # Quality targets (for CI/CD) + # Quality targets target_group = parser.add_argument_group('Quality Targets') target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) - target_group.add_argument('--is_decoder_only_model', action='store_true') - target_group.add_argument( - '--legacy_context_stacking', - action='store_true', - help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking', - ) - target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) - target_group.add_argument( - '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] - ) - target_group.add_argument('--dropout_text_input', action='store_true') - return parser +def _add_magpie_args(parser: argparse.ArgumentParser) -> None: + """Add arguments specific to encoder-decoder MagpieTTSModel.""" + group = parser.add_argument_group('MagpieTTS-specific Parameters') -def main(argv=None): - """Entry point for MagpieTTS inference and evaluation. + # MagpieTTS-specific model inference parameters (attention prior, EOS, etc.) + # Skip fields already added by the common inference group. + shared_field_names = {f.name for f in fields(EasyModelInferenceParameters)} + _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_field_names) - Args: - argv: Command-line arguments. If None, uses sys.argv. 
- """ - parser = create_argument_parser() - args = parser.parse_args(argv) + group.add_argument('--maskgit_n_steps', type=int, default=3) + group.add_argument('--maskgit_noise_scale', type=float, default=0.0) + group.add_argument('--maskgit_fixed_schedule', type=int, nargs='+', default=None) + group.add_argument( + '--maskgit_sampling_type', + default=None, + choices=["default", "causal", "purity_causal", "purity_default"], + ) - dataset_meta_info = load_evalset_config(args.datasets_json_path) - datasets = filter_datasets(dataset_meta_info, args.datasets) - logging.info(f"Loaded {len(datasets)} datasets: {', '.join(datasets)}") +def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: + """Add arguments specific to decoder-only EasyMagpieTTSModel.""" + group = parser.add_argument_group('EasyMagpieTTS-specific Parameters') + group.add_argument( + '--phoneme_input_type', + type=str, + default='gt', + choices=['gt', 'predicted'], + help='Source of phoneme input for decoder-only model', + ) + group.add_argument( + '--phoneme_sampling_method', + type=str, + default='argmax', + choices=['argmax', 'multinomial'], + help='Sampling method for phoneme prediction', + ) + group.add_argument('--dropout_text_input', action='store_true', help='Force dropout on text input') + group.add_argument( + '--legacy_context_stacking', + action='store_true', + help='Use audio_bos_id/audio_eos_id for context stacking', + ) - # Determine mode and validate - has_checkpoint_mode = ( - args.hparams_files is not None - and args.checkpoint_files is not None - and args.hparams_files != "null" - and args.checkpoint_files != "null" + +def create_argument_parser() -> argparse.ArgumentParser: + """Create the CLI argument parser with all argument groups.""" + parser = argparse.ArgumentParser( + description='TTS Inference and Evaluation (MagpieTTS & EasyMagpieTTS)', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, ) - has_nemo_mode = args.nemo_files is not None and 
args.nemo_files != "null" + _add_common_args(parser) + _add_magpie_args(parser) + _add_easy_magpie_args(parser) + return parser - if not has_checkpoint_mode and not has_nemo_mode: - parser.error("You must provide either:\n 1. --hparams_files and --checkpoint_files\n 2. --nemo_files") - # Build configurations - model_inference_parameters = {} - for field in fields(ModelInferenceParameters): - field_name = field.name - arg_from_cmdline = vars(args)[field_name] - if arg_from_cmdline is not None: - if field_name in ["estimate_alignment_from_layers", "apply_prior_to_layers"]: - model_inference_parameters[field_name] = parse_layer_list(arg_from_cmdline) +# --------------------------------------------------------------------------- +# Config builders (one per model type) +# --------------------------------------------------------------------------- + + +def _build_inference_params_from_args(param_cls: type, args): + """Extract inference parameters from parsed CLI args for the given dataclass.""" + params = {} + for f in fields(param_cls): + arg_val = vars(args).get(f.name) + if arg_val is not None: + if f.name in ("estimate_alignment_from_layers", "apply_prior_to_layers"): + params[f.name] = parse_layer_list(arg_val) else: - model_inference_parameters[field_name] = arg_from_cmdline + params[f.name] = arg_val + return param_cls.from_dict(params) - if "max_decoder_steps" not in model_inference_parameters: - if args.is_decoder_only_model: - model_inference_parameters["max_decoder_steps"] = 300 - inference_config = InferenceConfig( - model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), +def _build_magpie_config(args) -> MagpieInferenceConfig: + return MagpieInferenceConfig( + model_inference_parameters=_build_inference_params_from_args(ModelInferenceParameters, args), batch_size=args.batch_size, use_cfg=args.use_cfg, apply_attention_prior=args.apply_attention_prior, @@ -572,13 +580,54 @@ def main(argv=None): 
maskgit_noise_scale=args.maskgit_noise_scale, maskgit_fixed_schedule=args.maskgit_fixed_schedule, maskgit_sampling_type=args.maskgit_sampling_type, - is_decoder_only_model=args.is_decoder_only_model, + ) + + +def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: + return EasyMagpieInferenceConfig( + model_inference_parameters=_build_inference_params_from_args(EasyModelInferenceParameters, args), + batch_size=args.batch_size, + use_cfg=args.use_cfg, + use_local_transformer=args.use_local_transformer, phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, legacy_context_stacking=args.legacy_context_stacking, ) + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main(argv=None): + """Entry point for TTS inference and evaluation.""" + parser = create_argument_parser() + args = parser.parse_args(argv) + + dataset_meta_info = load_evalset_config(args.datasets_json_path) + datasets = filter_datasets(dataset_meta_info, args.datasets) + logging.info(f"Loaded {len(datasets)} datasets: {', '.join(datasets)}") + + # Validate model loading args + has_checkpoint_mode = ( + args.hparams_files is not None + and args.checkpoint_files is not None + and args.hparams_files != "null" + and args.checkpoint_files != "null" + ) + has_nemo_mode = args.nemo_files is not None and args.nemo_files != "null" + + if not has_checkpoint_mode and not has_nemo_mode: + parser.error("You must provide either:\n 1. --hparams_files and --checkpoint_files\n 2. 
--nemo_files") + + # Select model loader and config builder based on --model_type + is_easy_magpie = args.model_type == 'easy_magpie' + load_fn = load_easy_magpie_model if is_easy_magpie else load_magpie_model + inference_config = _build_easy_magpie_config(args) if is_easy_magpie else _build_magpie_config(args) + runner_cls = EasyMagpieInferenceRunner if is_easy_magpie else MagpieInferenceRunner + eval_config = EvaluationConfig( sv_model=args.sv_model, asr_model_name=args.asr_model_name, @@ -589,7 +638,7 @@ def main(argv=None): cer, ssim = None, None - # Run for each model (checkpoint or nemo) + # Iterate over model files (checkpoint or nemo) if has_checkpoint_mode: hparam_files = args.hparams_files.split(",") checkpoint_files = args.checkpoint_files.split(",") @@ -609,17 +658,28 @@ def main(argv=None): hparams_from_wandb=args.hparams_file_from_wandb, ) + model, checkpoint_name = load_fn(model_config) + moe_info, flops_per_component = log_model_architecture_summary(model) + + if args.log_exp_name and model_config.checkpoint_file: + exp_name = get_experiment_name_from_checkpoint_path(model_config.checkpoint_file) + checkpoint_name = f"{exp_name}__{checkpoint_name}" + + runner = runner_cls(model, inference_config) + cer, ssim = run_inference_and_evaluation( - model_config=model_config, + runner=runner, + checkpoint_name=checkpoint_name, inference_config=inference_config, eval_config=eval_config, dataset_meta_info=dataset_meta_info, datasets=datasets, out_dir=args.out_dir, + flops_per_component=flops_per_component, + moe_info=moe_info, num_repeats=args.num_repeats, confidence_level=args.confidence_level, violin_plot_metrics=args.violin_plot_metrics, - log_exp_name=args.log_exp_name, clean_up_disk=args.clean_up_disk, skip_evaluation=not args.run_evaluation, ) @@ -635,17 +695,24 @@ def main(argv=None): legacy_text_conditioning=args.legacy_text_conditioning, ) + model, checkpoint_name = load_fn(model_config) + moe_info, flops_per_component = 
log_model_architecture_summary(model) + + runner = runner_cls(model, inference_config) + cer, ssim = run_inference_and_evaluation( - model_config=model_config, + runner=runner, + checkpoint_name=checkpoint_name, inference_config=inference_config, eval_config=eval_config, dataset_meta_info=dataset_meta_info, datasets=datasets, out_dir=args.out_dir, + flops_per_component=flops_per_component, + moe_info=moe_info, num_repeats=args.num_repeats, confidence_level=args.confidence_level, violin_plot_metrics=args.violin_plot_metrics, - log_exp_name=args.log_exp_name, clean_up_disk=args.clean_up_disk, skip_evaluation=not args.run_evaluation, ) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5a117432b986..19705eed1ad3 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,7 +14,7 @@ import json import os import random -from dataclasses import dataclass +from dataclasses import dataclass, fields from typing import Dict, List, Optional, Tuple import numpy as np @@ -98,6 +98,29 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] +@dataclass +class EasyModelInferenceParameters: + """Inference parameters for the decoder-only EasyMagpieTTS model. + + Attributes: + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider in sampling. + cfg_scale: Scale factor for classifier-free guidance. + """ + + max_decoder_steps: int = 500 + temperature: float = 0.7 + topk: int = 80 + cfg_scale: float = 2.5 + + @classmethod + def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': + field_names = {field.name for field in fields(cls)} + filtered_data = {k: v for k, v in data.items() if k in field_names} + return cls(**filtered_data) + + class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ Magpie-TTS Model Decoder Only Model with training support. 
diff --git a/nemo/collections/tts/modules/magpietts_inference/__init__.py b/nemo/collections/tts/modules/magpietts_inference/__init__.py index fd99780f21b2..b1ff0aefe91e 100644 --- a/nemo/collections/tts/modules/magpietts_inference/__init__.py +++ b/nemo/collections/tts/modules/magpietts_inference/__init__.py @@ -12,35 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -MagpieTTS inference and evaluation subpackage. +TTS inference and evaluation subpackage. This package provides modular components for: - Model loading and configuration (utils.py) -- Batch inference (inference.py) +- Batch inference (inference.py) for both MagpieTTS and EasyMagpieTTS - Audio quality evaluation (evaluation.py) - Metrics visualization (visualization.py) -Example Usage: - from examples.tts.magpietts import ( - InferenceConfig, +Example Usage (MagpieTTS - encoder-decoder): + from nemo.collections.tts.modules.magpietts_inference import ( + MagpieInferenceConfig, MagpieInferenceRunner, load_magpie_model, ModelLoadConfig, ) - # Load model - model_config = ModelLoadConfig( - nemo_file="/path/to/model.nemo", - codecmodel_path="/path/to/codec.nemo", - ) - model, checkpoint_name = load_magpie_model(model_config) + model_config = ModelLoadConfig(nemo_file="/path/to/model.nemo", codecmodel_path="/path/to/codec.nemo") + model, name = load_magpie_model(model_config) + runner = MagpieInferenceRunner(model, MagpieInferenceConfig()) - # Log architecture summary and retrieve MoE info + FLOPs metrics - moe_info, flops_per_component = log_model_architecture_summary(model) +Example Usage (EasyMagpieTTS - decoder-only): + from nemo.collections.tts.modules.magpietts_inference import ( + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + load_easy_magpie_model, + ModelLoadConfig, + ) - # Create runner and run inference - inference_config = InferenceConfig() - runner = MagpieInferenceRunner(model, inference_config) + model_config = 
ModelLoadConfig(nemo_file="/path/to/model.nemo", codecmodel_path="/path/to/codec.nemo") + model, name = load_easy_magpie_model(model_config) + runner = EasyMagpieInferenceRunner(model, EasyMagpieInferenceConfig()) """ from nemo.collections.tts.modules.magpietts_inference.evaluation import ( @@ -49,11 +51,20 @@ compute_mean_with_confidence_interval, evaluate_generated_audio_dir, ) -from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner +from nemo.collections.tts.modules.magpietts_inference.inference import ( + BaseInferenceConfig, + BaseInferenceRunner, + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + InferenceConfig, + MagpieInferenceConfig, + MagpieInferenceRunner, +) from nemo.collections.tts.modules.magpietts_inference.utils import ( ModelLoadConfig, compute_ffn_flops_per_token, get_experiment_name_from_checkpoint_path, + load_easy_magpie_model, load_magpie_model, log_model_architecture_summary, ) @@ -63,12 +74,19 @@ # Utils "ModelLoadConfig", "load_magpie_model", + "load_easy_magpie_model", "compute_ffn_flops_per_token", "get_experiment_name_from_checkpoint_path", "log_model_architecture_summary", - # Inference + # Inference configs + "BaseInferenceConfig", + "MagpieInferenceConfig", + "EasyMagpieInferenceConfig", "InferenceConfig", + # Inference runners + "BaseInferenceRunner", "MagpieInferenceRunner", + "EasyMagpieInferenceRunner", # Evaluation "EvaluationConfig", "evaluate_generated_audio_dir", diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index cf325b91d71c..d5d34537e088 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -12,15 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Core inference logic for MagpieTTS. 
+Core inference logic for MagpieTTS models. -This module provides: -- InferenceConfig: Dataclass for inference hyperparameters -- MagpieInferenceRunner: Class for running batch inference with a loaded model - (uses unified inference path with automatic text chunking based on language thresholds) +This module provides a strategy-pattern based inference framework with: +- BaseInferenceConfig / MagpieInferenceConfig / EasyMagpieInferenceConfig +- BaseInferenceRunner / MagpieInferenceRunner / EasyMagpieInferenceRunner + +MagpieInferenceRunner handles the encoder-decoder MagpieTTSModel +(chunked text, generate_speech + codes_to_audio). + +EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSModel +(infer_batch, returns audio directly). """ from __future__ import annotations +import abc import glob import os import shutil @@ -34,65 +40,56 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel +from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging +# --------------------------------------------------------------------------- +# Inference config hierarchy +# --------------------------------------------------------------------------- + + @dataclass -class InferenceConfig: - """Configuration for MagpieTTS inference. - - Attributes: - batch_size: Batch size for inference. - use_cfg: Whether to use classifier-free guidance. - apply_attention_prior: Whether to apply attention prior during decoding. 
- - # Model specific inference parameters - model_inference_parameters: See ModelInferenceParameters dataclass - - # Local transformer / MaskGit parameters - use_local_transformer: Whether to use local transformer for inference. - maskgit_n_steps: Number of MaskGit refinement steps. - maskgit_noise_scale: Noise scale for MaskGit sampling. - maskgit_fixed_schedule: Fixed schedule for MaskGit (optional). - maskgit_sampling_type: Type of MaskGit sampling. +class BaseInferenceConfig(abc.ABC): + """Shared inference configuration fields. + + Subclasses must declare their own ``model_inference_parameters`` field + with the appropriate type (ModelInferenceParameters or + EasyModelInferenceParameters). """ - # Core sampling parameters batch_size: int = 32 use_cfg: bool = False - apply_attention_prior: bool = False + use_local_transformer: bool = False + + @abc.abstractmethod + def build_identifier(self) -> str: + """Build a unique identifier string for naming output directories.""" + ... + + @staticmethod + def _format_layer_list(layers: Optional[List[int]]) -> str: + if layers is None: + return "None" + return "".join(str(_layer) for _layer in layers) + + +@dataclass +class MagpieInferenceConfig(BaseInferenceConfig): + """Configuration for encoder-decoder MagpieTTSModel inference.""" + model_inference_parameters: ModelInferenceParameters = field(default_factory=ModelInferenceParameters) + apply_attention_prior: bool = False - # Local transformer / MaskGit parameters - use_local_transformer: bool = False + # MaskGit parameters maskgit_n_steps: int = 3 maskgit_noise_scale: float = 0.0 maskgit_fixed_schedule: Optional[List[int]] = None maskgit_sampling_type: Optional[str] = None - # Decoder-only inference options - phoneme_input_type: str = "gt" # gt or predicted - phoneme_sampling_method: str = "argmax" # argmax or multinomial - dropout_text_input: bool = False - legacy_context_stacking: bool = False # Use audio_bos_id/audio_eos_id for context stacking - - # Longform 
inference mode - longform_mode: str = "auto" # "auto" | "always" | "never" - longform_word_threshold: int = 40 # Word threshold for auto-detection - - is_decoder_only_model: bool = False - def build_identifier(self) -> str: - """Build a unique identifier string for this configuration. - - Used for naming output directories and files. - - Returns: - String identifier incorporating key config values. - """ parts = [ f"Temp{self.model_inference_parameters.temperature}", f"Topk{self.model_inference_parameters.topk}", @@ -123,134 +120,69 @@ def build_identifier(self) -> str: return "_".join(parts) - @staticmethod - def _format_layer_list(layers: Optional[List[int]]) -> str: - """Format a list of layer indices as a compact string.""" - if layers is None: - return "None" - return "".join(str(_layer) for _layer in layers) + +@dataclass +class EasyMagpieInferenceConfig(BaseInferenceConfig): + """Configuration for decoder-only EasyMagpieTTSModel inference.""" + + model_inference_parameters: EasyModelInferenceParameters = field( + default_factory=EasyModelInferenceParameters + ) + phoneme_input_type: str = "gt" + phoneme_sampling_method: str = "argmax" + dropout_text_input: bool = False + legacy_context_stacking: bool = False + + def build_identifier(self) -> str: + parts = [ + f"Temp{self.model_inference_parameters.temperature}", + f"Topk{self.model_inference_parameters.topk}", + f"Cfg_{self.use_cfg}_{self.model_inference_parameters.cfg_scale}", + f"LT_{self.use_local_transformer}", + f"Phoneme_{self.phoneme_input_type}_{self.phoneme_sampling_method}", + ] + return "_".join(parts) + + +# Backwards-compatible aliases +InferenceConfig = MagpieInferenceConfig + + +# --------------------------------------------------------------------------- +# Inference runner hierarchy +# --------------------------------------------------------------------------- -class MagpieInferenceRunner: - """Runner class for MagpieTTS batch inference. 
+class BaseInferenceRunner(abc.ABC): + """Abstract base for TTS inference runners. - Encapsulates the logic for running inference on a dataset, saving outputs, - and collecting metrics. + Provides shared utilities (batch-to-cuda, file cleanup, reference audio + copying, RTF metrics) and declares the interface that concrete runners + must implement. """ - def __init__( - self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel - model: Union[MagpieTTSModel, EasyMagpieTTSModel], - config: InferenceConfig, - ): - """Initialize the inference runner. - - Args: - model: Loaded MagpieTTS model (should be on GPU and in eval mode). - config: Inference configuration. - """ + def __init__(self, model, config: BaseInferenceConfig): self.model = model self.config = config - - # Set legacy context stacking flag on model - self.model.legacy_context_stacking = config.legacy_context_stacking - - # Set phoneme probability to 1 for inference self._configure_tokenizer() - - # Cached state from create_dataset (set when create_dataset is called) self._manifest_records: Optional[List[dict]] = None self._audio_base_dir: Optional[str] = None - def _configure_tokenizer(self) -> None: - """Configure the tokenizer for inference (phoneme prob = 1.0).""" - g2p = None - if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( - self.model.tokenizer.tokenizers["english_phoneme"], "g2p" - ): - g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p - elif isinstance(self.model.tokenizer, IPATokenizer): - g2p = self.model.tokenizer.g2p - - if g2p is not None: - g2p.phoneme_probability = 1.0 + # -- interface ----------------------------------------------------------- + @abc.abstractmethod def create_dataset( self, dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: - """Create an inference 
dataset. - - Standard MagpieTTS uses the chunked inference dataset from `main`. - Decoder-only MagpieTTS uses the regular dataset and its dedicated - `infer_batch()` inference path. - - Args: - dataset_meta: Dataset metadata dictionary with 'manifest_path' and 'audio_dir'. - context_duration_min: Minimum context duration (uses model default if None). - context_duration_max: Maximum context duration (uses model default if None). - - Returns: - Configured ChunkedTTSInferenceDataset instance. - """ - # Use model defaults if not specified - if context_duration_min is None: - context_duration_min = self.model.cfg.get('context_duration_min', 5.0) - if context_duration_max is None: - context_duration_max = self.model.cfg.get('context_duration_max', 5.0) - - # For multi-encoder models, use fixed 5s context for fair evaluation - if context_duration_min < 5.0 and context_duration_max > 5.0: - context_duration_min = 5.0 - context_duration_max = 5.0 - - # Read manifest and cache for later use - dataset_name = list(dataset_meta.keys())[0] - dataset_info = dataset_meta[dataset_name] - manifest_path = dataset_info.get('manifest_path') - audio_dir = dataset_info.get('audio_dir', '') - logging.info(f"Dataset name: {dataset_name}, manifest_path: {manifest_path}, audio_dir: {audio_dir}") - - self._manifest_records = read_manifest(manifest_path) - self._audio_base_dir = audio_dir - if self.config.is_decoder_only_model: - logging.info("Creating standard inference dataset for decoder-only model") - dataset = MagpieTTSDataset( - dataset_meta=dataset_meta, - sample_rate=self.model.sample_rate, - min_duration=0.5, - max_duration=20, - codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, - bos_id=getattr(self.model, "bos_id", None), - eos_id=self.model.eos_id, - num_audio_codebooks=self.model.num_audio_codebooks, - prior_scaling_factor=None, - load_cached_codes_if_available=False, - dataset_type='test', - tokenizer_config=None, - load_16khz_audio=False, - 
use_text_conditioning_tokenizer=True, - text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, - pad_context_text_to_max_duration=False, - context_duration_min=context_duration_min, - context_duration_max=context_duration_max, - ) - dataset.text_tokenizer = self.model.tokenizer - else: - logging.info("Creating unified inference dataset") - dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) - - if hasattr(self.model, 'phoneme_tokenizer'): - dataset.phoneme_tokenizer = self.model.phoneme_tokenizer - - return dataset + ... + @abc.abstractmethod def run_inference_on_dataset( self, - dataset: ChunkedTTSInferenceDataset, + dataset, output_dir: str, manifest_records: Optional[List[dict]] = None, audio_base_dir: Optional[str] = None, @@ -258,127 +190,66 @@ def run_inference_on_dataset( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run inference on a dataset. - - Args: - dataset: The inference dataset (created by create_dataset()). - output_dir: Directory to save generated audio and artifacts. - manifest_records: Original manifest records (uses cached if None). - audio_base_dir: Base directory for audio paths (uses cached if None). - save_cross_attention_maps: Whether to save attention map images (not used in unified path). - save_context_audio: Whether to copy context audio files. - save_predicted_codes: Whether to save predicted code files. - - Returns: - Tuple of: - - rtf_metrics: List of real-time factor metrics per batch. - - generated_audio_paths: List of paths to generated audio files. - - codec_file_paths: List of paths to predicted codes files. - """ - # Use cached values if not provided + ... 
+ + # -- shared helpers ------------------------------------------------------ + + def _configure_tokenizer(self) -> None: + """Configure the tokenizer for inference (phoneme prob = 1.0).""" + g2p = None + if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( + self.model.tokenizer.tokenizers["english_phoneme"], "g2p" + ): + g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p + elif isinstance(self.model.tokenizer, IPATokenizer): + g2p = self.model.tokenizer.g2p + + if g2p is not None: + g2p.phoneme_probability = 1.0 + + def _resolve_manifest_and_audio_dir( + self, + manifest_records: Optional[List[dict]], + audio_base_dir: Optional[str], + ) -> Tuple[List[dict], str]: if manifest_records is None: if self._manifest_records is None: raise ValueError("manifest_records not provided and not cached from create_dataset()") manifest_records = self._manifest_records - if audio_base_dir is None: if self._audio_base_dir is None: raise ValueError("audio_base_dir not provided and not cached from create_dataset()") audio_base_dir = self._audio_base_dir + return manifest_records, audio_base_dir - if self.config.is_decoder_only_model: - logging.info("Using decoder-only inference path") - return self._run_decoder_only_inference( - dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes - ) - - logging.info("Using unified inference path") - return self._run_unified_inference( - dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes - ) + def _read_and_cache_manifest(self, dataset_meta: dict) -> Tuple[str, str]: + """Read manifest from dataset_meta, cache records, return (manifest_path, audio_dir).""" + dataset_name = list(dataset_meta.keys())[0] + dataset_info = dataset_meta[dataset_name] + manifest_path = dataset_info.get('manifest_path') + audio_dir = dataset_info.get('audio_dir', '') + logging.info(f"Dataset name: 
{dataset_name}, manifest_path: {manifest_path}, audio_dir: {audio_dir}") + self._manifest_records = read_manifest(manifest_path) + self._audio_base_dir = audio_dir + return manifest_path, audio_dir - def _run_decoder_only_inference( + def _get_context_durations( self, - dataset: MagpieTTSDataset, - output_dir: str, - manifest_records: List[dict], - audio_base_dir: str, - save_context_audio: bool = True, - save_predicted_codes: bool = True, - ) -> Tuple[List[dict], List[str], List[str]]: - """Run inference for decoder-only models via `infer_batch()`.""" - os.makedirs(output_dir, exist_ok=True) - self._delete_old_generated_files(output_dir) - - dataloader = torch.utils.data.DataLoader( - dataset, - batch_size=self.config.batch_size, - collate_fn=dataset.collate_fn, - num_workers=0, - shuffle=False, - ) - - all_rtf_metrics = [] - generated_audio_paths = [] - codec_file_paths = [] - item_idx = 0 - phoneme_sampling_method = ( - "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method - ) - - for batch_idx, batch in enumerate(dataloader): - logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - batch = self._batch_to_cuda(batch) - output = self.model.infer_batch( - batch, - max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, - temperature=self.config.model_inference_parameters.temperature, - topk=self.config.model_inference_parameters.topk, - use_cfg=self.config.use_cfg, - cfg_scale=self.config.model_inference_parameters.cfg_scale, - use_local_transformer_for_inference=self.config.use_local_transformer, - phoneme_input_type=self.config.phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - force_dropout_text=self.config.dropout_text_input, - ) - predicted_audio = output.predicted_audio - predicted_audio_lens = output.predicted_audio_lens - predicted_codes = output.predicted_codes - predicted_codes_lens = output.predicted_codes_lens - rtf_metrics = output.rtf_metrics - - 
all_rtf_metrics.append(rtf_metrics) - logging.info(f"Output shape: {predicted_audio.size()}") - - for idx in range(predicted_audio.size(0)): - audio_len = predicted_audio_lens[idx].item() - audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] - audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") - sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) - sf.write(audio_path, audio_np, sample_rate) - generated_audio_paths.append(audio_path) - - if save_context_audio and item_idx < len(manifest_records): - self._copy_reference_audio( - manifest_records[item_idx], - audio_base_dir, - output_dir, - item_idx, - ) - - if save_predicted_codes: - code_len = predicted_codes_lens[idx].item() - codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") - torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) - codec_file_paths.append(codes_path) - - item_idx += 1 - - return all_rtf_metrics, generated_audio_paths, codec_file_paths + context_duration_min: Optional[float], + context_duration_max: Optional[float], + ) -> Tuple[float, float]: + if context_duration_min is None: + context_duration_min = self.model.cfg.get('context_duration_min', 5.0) + if context_duration_max is None: + context_duration_max = self.model.cfg.get('context_duration_max', 5.0) + if context_duration_min < 5.0 and context_duration_max > 5.0: + context_duration_min = 5.0 + context_duration_max = 5.0 + return context_duration_min, context_duration_max @staticmethod def _batch_to_cuda(batch: dict) -> dict: - """Move batch tensors to CUDA device.""" batch_cuda = {} for key, value in batch.items(): if isinstance(value, torch.Tensor): @@ -389,7 +260,6 @@ def _batch_to_cuda(batch: dict) -> dict: @staticmethod def _delete_old_generated_files(output_dir: str) -> None: - """Delete leftover generated files from previous runs.""" logging.info(f"Cleaning up old generated files in: {output_dir}") patterns = [ 
"predicted_codes*.pt", @@ -407,7 +277,6 @@ def _copy_reference_audio( output_dir: str, item_idx: int, ) -> None: - """Copy context and target audio files to output directory.""" context_path = record.get('context_audio_filepath') target_path = record.get('audio_filepath') @@ -425,48 +294,69 @@ def _copy_reference_audio( @staticmethod def compute_mean_rtf_metrics(rtf_metrics_list: List[dict]) -> Dict[str, float]: - """Compute mean RTF metrics across batches.""" if not rtf_metrics_list or not rtf_metrics_list[0]: return {} - mean_metrics = {} for key in rtf_metrics_list[0]: values = [m[key] for m in rtf_metrics_list if key in m] mean_metrics[key] = float(sum(values) / len(values)) if values else 0.0 - return mean_metrics - def _create_chunked_inference_dataset( + +# --------------------------------------------------------------------------- +# MagpieInferenceRunner (encoder-decoder MagpieTTSModel) +# --------------------------------------------------------------------------- + + +class MagpieInferenceRunner(BaseInferenceRunner): + """Runner for encoder-decoder MagpieTTSModel. + + Uses ChunkedTTSInferenceDataset and model.generate_speech() per chunk, + then model.codes_to_audio() to produce waveforms. + """ + + def __init__(self, model, config: MagpieInferenceConfig): + super().__init__(model, config) + + def create_dataset( self, dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, ) -> ChunkedTTSInferenceDataset: - """Create a unified inference dataset. - - Creates ChunkedTTSInferenceDataset which uses language-aware chunking - to automatically handle both short and long texts. + context_duration_min, context_duration_max = self._get_context_durations( + context_duration_min, context_duration_max + ) + self._read_and_cache_manifest(dataset_meta) - Args: - dataset_meta: Dataset metadata dictionary (same format as MagpieTTSDataset). - context_duration_min: Minimum context duration (uses model default if None). 
- context_duration_max: Maximum context duration (uses model default if None). + logging.info("Creating unified inference dataset") + dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) + return dataset - Returns: - Configured ChunkedTTSInferenceDataset instance. - """ - # Use model defaults if not specified - if context_duration_min is None: - context_duration_min = self.model.cfg.get('context_duration_min', 5.0) - if context_duration_max is None: - context_duration_max = self.model.cfg.get('context_duration_max', 5.0) + def run_inference_on_dataset( + self, + dataset: ChunkedTTSInferenceDataset, + output_dir: str, + manifest_records: Optional[List[dict]] = None, + audio_base_dir: Optional[str] = None, + save_cross_attention_maps: bool = True, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + manifest_records, audio_base_dir = self._resolve_manifest_and_audio_dir(manifest_records, audio_base_dir) + logging.info("Using unified inference path") + return self._run_unified_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) - # For multi-encoder models, use fixed 5s context for fair evaluation - if context_duration_min < 5.0 and context_duration_max > 5.0: - context_duration_min = 5.0 - context_duration_max = 5.0 + # -- private ------------------------------------------------------------- - # Create unified dataset - language and tokenizer are determined per-sample from manifest + def _create_chunked_inference_dataset( + self, + dataset_meta: dict, + context_duration_min: float, + context_duration_max: float, + ) -> ChunkedTTSInferenceDataset: dataset = ChunkedTTSInferenceDataset( dataset_meta=dataset_meta, sample_rate=self.model.output_sample_rate, @@ -480,10 +370,7 @@ def _create_chunked_inference_dataset( 
pad_context_text_to_max_duration=self.model.pad_context_text_to_max_duration, load_16khz_audio=self.model.model_type == 'single_encoder_sv_tts', ) - - # Attach model's tokenizer dataset.text_tokenizer = self.model.tokenizer - return dataset def _run_unified_inference( @@ -495,26 +382,6 @@ def _run_unified_inference( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run unified inference with automatic single/multi-chunk handling. - - Processes all samples through generate_speech, passing - beginning_of_text and end_of_text so the model can handle both - single-chunk (short text) and multi-chunk (long text) cases correctly. - - Args: - dataset: ChunkedTTSInferenceDataset created by create_dataset(). - output_dir: Directory to save generated audio and artifacts. - manifest_records: List of manifest record dictionaries. - audio_base_dir: Base directory for resolving audio paths. - save_context_audio: Whether to copy context audio files. - save_predicted_codes: Whether to save predicted code files. - - Returns: - Tuple of: - - rtf_metrics: List of real-time factor metrics per batch. - - generated_audio_paths: List of paths to generated audio files. - - codec_file_paths: List of paths to predicted codes files. 
- """ os.makedirs(output_dir, exist_ok=True) self._delete_old_generated_files(output_dir) @@ -522,7 +389,7 @@ def _run_unified_inference( dataset, batch_size=self.config.batch_size, collate_fn=dataset.collate_fn, - num_workers=0, # Avoid multiprocessing issues with CUDA + num_workers=0, shuffle=False, ) @@ -534,54 +401,42 @@ def _run_unified_inference( for batch_idx, batch in enumerate(dataloader): logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - # Move batch tensors to CUDA batch = self._batch_to_cuda(batch) - batch['sample_rate'] = self.model.output_sample_rate batch['context_sample_rate'] = self.model.output_sample_rate batch_size = len(batch['chunked_tokens']) max_num_chunks = max(len(tokens) for tokens in batch['chunked_tokens']) - # Clear stale KV cache from prior inference calls (e.g., the previous batch or dataset - # may have left with populated tensors). logging.info(f"Resetting KV cache for decoder: {self.model.use_kv_cache_for_inference}") use_kv_cache_for_this_batch = self.model.use_kv_cache_for_inference if max_num_chunks == 1 else False self.model.decoder.reset_cache(use_cache=use_kv_cache_for_this_batch) - # Create chunk state for this batch chunk_state = self.model.create_chunk_state(batch_size=batch_size) - # Accumulators for predicted codes predicted_codes_per_sample = [[] for _ in range(batch_size)] predicted_codes_lens = [0 for _ in range(batch_size)] - # Overwrite the model's parameters since we want to use the arguments from the commandline self.model.inference_parameters = self.config.model_inference_parameters start_time = time.time() - # Iterate over text chunks (1 for short text, N for long text) for chunk_idx in range(max_num_chunks): - # Extract current chunk tokens for each sample current_tokens = [] current_tokens_lens = [] for b_idx in range(batch_size): current_tokens.append(batch['chunked_tokens'][b_idx][chunk_idx]) current_tokens_lens.append(batch['chunked_tokens_lens'][b_idx][chunk_idx]) - # Pad tokens to max 
length in this chunk max_len = max(current_tokens_lens) batch['text'] = stack_tensors(current_tokens, max_lens=[max_len]).cuda() batch['text_lens'] = torch.tensor(current_tokens_lens, dtype=torch.int32).cuda() - # Compute is_end_of_text flags (per-sample) is_end_of_text = self._compute_end_of_text_flags( batch, chunk_idx, max_num_chunks, current_tokens_lens, batch_size ) beginning_of_text = chunk_idx == 0 - # Call generate_speech (unified entry point) output = self.model.generate_speech( batch, chunk_state=chunk_state, @@ -595,16 +450,12 @@ def _run_unified_inference( maskgit_sampling_type=self.config.maskgit_sampling_type, ) - # Unpack output chunk_codes = output.predicted_codes chunk_codes_lens = output.predicted_codes_lens - # Accumulate codes for each sample for b_idx in range(batch_size): - # Skip if this sample's text has ended (padding chunks) if is_end_of_text[b_idx] and current_tokens_lens[b_idx] == 1: continue - code_len = chunk_codes_lens[b_idx] if code_len > 0: codes_slice = chunk_codes[b_idx][:, :code_len] @@ -614,17 +465,14 @@ def _run_unified_inference( elapsed = time.time() - start_time logging.info(f"Batch inference time: {elapsed:.2f}s") - # Concatenate codes and convert to audio predicted_codes_list = [] for b_idx in range(batch_size): if predicted_codes_per_sample[b_idx]: concatenated = torch.cat(predicted_codes_per_sample[b_idx], dim=1).cuda() else: - # Empty placeholder concatenated = torch.zeros((self.model.num_audio_codebooks, 1), dtype=torch.long, device='cuda') predicted_codes_list.append(concatenated) - # Stack and convert to audio max_code_len = max(predicted_codes_lens) if any(predicted_codes_lens) else 1 predicted_codes = stack_tensors(predicted_codes_list, max_lens=[max_code_len]).cuda() predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') @@ -633,7 +481,6 @@ def _run_unified_inference( predicted_codes, predicted_codes_lens_tensor ) - # Compute RTF metrics total_audio_samples = 
sum(predicted_audio_lens.cpu().tolist()) total_audio_seconds = total_audio_samples / self.model.output_sample_rate rtf = elapsed / total_audio_seconds if total_audio_seconds > 0 else 0.0 @@ -644,7 +491,6 @@ def _run_unified_inference( } all_rtf_metrics.append(rtf_metrics) - # Save outputs predicted_audio_np = predicted_audio.float().detach().cpu().numpy() for b_idx in range(batch_size): @@ -656,7 +502,6 @@ def _run_unified_inference( sf.write(audio_path, audio_np, self.model.output_sample_rate) generated_audio_paths.append(audio_path) - # Copy reference audio if requested if save_context_audio and sample_idx < len(manifest_records): self._copy_reference_audio( manifest_records[sample_idx], @@ -667,7 +512,7 @@ def _run_unified_inference( if save_predicted_codes: codes_path = os.path.join(output_dir, f"predicted_codes_{sample_idx}.pt") - predicted_codes_current = predicted_codes[b_idx, :, : predicted_codes_lens[b_idx]] # C, T + predicted_codes_current = predicted_codes[b_idx, :, : predicted_codes_lens[b_idx]] torch.save(predicted_codes_current, codes_path) codec_file_paths.append(codes_path) @@ -675,38 +520,173 @@ def _run_unified_inference( return all_rtf_metrics, generated_audio_paths, codec_file_paths + @staticmethod def _compute_end_of_text_flags( - self, batch: Dict[str, Any], chunk_idx: int, max_num_chunks: int, current_tokens_lens: List[int], batch_size: int, ) -> List[bool]: - """Compute end-of-text flags for each sample in batch. - - Args: - batch: Current batch dictionary. - chunk_idx: Current chunk index. - max_num_chunks: Maximum number of chunks in this batch. - current_tokens_lens: Token lengths for current chunk per sample. - batch_size: Number of samples in batch. - - Returns: - List of booleans indicating if each sample has reached end of text. 
- """ is_end_of_text = [] for b_idx in range(batch_size): if chunk_idx == max_num_chunks - 1: - # Last chunk is_end_of_text.append(True) elif current_tokens_lens[b_idx] == 1: - # Current chunk is padding is_end_of_text.append(True) elif batch['chunked_tokens_lens'][b_idx][chunk_idx + 1] == 1: - # Next chunk is padding is_end_of_text.append(True) else: is_end_of_text.append(False) - return is_end_of_text + + +# --------------------------------------------------------------------------- +# EasyMagpieInferenceRunner (decoder-only EasyMagpieTTSModel) +# --------------------------------------------------------------------------- + + +class EasyMagpieInferenceRunner(BaseInferenceRunner): + """Runner for decoder-only EasyMagpieTTSModel. + + Uses MagpieTTSDataset and model.infer_batch() which returns audio directly. + """ + + def __init__(self, model, config: EasyMagpieInferenceConfig): + super().__init__(model, config) + self.model.legacy_context_stacking = config.legacy_context_stacking + + def create_dataset( + self, + dataset_meta: dict, + context_duration_min: Optional[float] = None, + context_duration_max: Optional[float] = None, + ) -> MagpieTTSDataset: + context_duration_min, context_duration_max = self._get_context_durations( + context_duration_min, context_duration_max + ) + self._read_and_cache_manifest(dataset_meta) + + logging.info("Creating inference dataset for decoder-only model") + dataset = MagpieTTSDataset( + dataset_meta=dataset_meta, + sample_rate=self.model.sample_rate, + min_duration=0.5, + max_duration=20, + codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, + bos_id=getattr(self.model, "bos_id", None), + eos_id=self.model.eos_id, + num_audio_codebooks=self.model.num_audio_codebooks, + prior_scaling_factor=None, + load_cached_codes_if_available=False, + dataset_type='test', + tokenizer_config=None, + load_16khz_audio=False, + use_text_conditioning_tokenizer=True, + 
text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=False, + context_duration_min=context_duration_min, + context_duration_max=context_duration_max, + ) + dataset.text_tokenizer = self.model.tokenizer + + if hasattr(self.model, 'phoneme_tokenizer'): + dataset.phoneme_tokenizer = self.model.phoneme_tokenizer + + return dataset + + def run_inference_on_dataset( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: Optional[List[dict]] = None, + audio_base_dir: Optional[str] = None, + save_cross_attention_maps: bool = True, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + manifest_records, audio_base_dir = self._resolve_manifest_and_audio_dir(manifest_records, audio_base_dir) + logging.info("Using decoder-only inference path") + return self._run_decoder_only_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) + + # -- private ------------------------------------------------------------- + + def _run_decoder_only_inference( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: List[dict], + audio_base_dir: str, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + os.makedirs(output_dir, exist_ok=True) + self._delete_old_generated_files(output_dir) + + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=self.config.batch_size, + collate_fn=dataset.collate_fn, + num_workers=0, + shuffle=False, + ) + + all_rtf_metrics = [] + generated_audio_paths = [] + codec_file_paths = [] + item_idx = 0 + phoneme_sampling_method = ( + "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method + ) + + for batch_idx, batch in enumerate(dataloader): + logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") + batch = 
self._batch_to_cuda(batch) + output = self.model.infer_batch( + batch, + max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, + temperature=self.config.model_inference_parameters.temperature, + topk=self.config.model_inference_parameters.topk, + use_cfg=self.config.use_cfg, + cfg_scale=self.config.model_inference_parameters.cfg_scale, + use_local_transformer_for_inference=self.config.use_local_transformer, + phoneme_input_type=self.config.phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + force_dropout_text=self.config.dropout_text_input, + ) + predicted_audio = output.predicted_audio + predicted_audio_lens = output.predicted_audio_lens + predicted_codes = output.predicted_codes + predicted_codes_lens = output.predicted_codes_lens + rtf_metrics = output.rtf_metrics + + all_rtf_metrics.append(rtf_metrics) + logging.info(f"Output shape: {predicted_audio.size()}") + + for idx in range(predicted_audio.size(0)): + audio_len = predicted_audio_lens[idx].item() + audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] + audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") + sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) + sf.write(audio_path, audio_np, sample_rate) + generated_audio_paths.append(audio_path) + + if save_context_audio and item_idx < len(manifest_records): + self._copy_reference_audio( + manifest_records[item_idx], + audio_base_dir, + output_dir, + item_idx, + ) + + if save_predicted_codes: + code_len = predicted_codes_lens[idx].item() + codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") + torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) + codec_file_paths.append(codes_path) + + item_idx += 1 + + return all_rtf_metrics, generated_audio_paths, codec_file_paths diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 
580a6e32ebc7..ca89356494fa 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -23,7 +23,7 @@ import os from dataclasses import dataclass -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -253,9 +253,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model( - config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False -) -> Tuple[Union[MagpieTTSModel, EasyMagpieTTSModel], str]: +def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[MagpieTTSModel, str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -273,7 +271,7 @@ def load_magpie_model( ValueError: If configuration is invalid or sample rates don't match. """ config.validate() - model_cls = EasyMagpieTTSModel if is_decoder_only_model else MagpieTTSModel + if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) @@ -292,7 +290,7 @@ def load_magpie_model( config.legacy_text_conditioning, ) - model = model_cls(cfg=model_cfg) + model = MagpieTTSModel(cfg=model_cfg) model.use_kv_cache_for_inference = True # Load weights @@ -304,15 +302,15 @@ def load_magpie_model( checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: - if config.nemo_file.startswith("nvidia/"): - model = model_cls.from_pretrained(config.nemo_file) + if config.nemo_file.startswith("nvidia/"): # TODO @xueyang: why ignore `update_config_for_inference`? 
+ model = MagpieTTSModel.from_pretrained(config.nemo_file) model.use_kv_cache_for_inference = True checkpoint_name = config.nemo_file.split("/")[-1] cfg_sample_rate = None else: # Mode 2: Load from .nemo archive logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = model_cls.restore_from(config.nemo_file, return_config=True) + model_cfg = MagpieTTSModel.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg, cfg_sample_rate = update_config_for_inference( @@ -322,7 +320,7 @@ def load_magpie_model( config.legacy_text_conditioning, ) - model = model_cls.restore_from(config.nemo_file, override_config_path=model_cfg) + model = MagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) model.use_kv_cache_for_inference = True checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") @@ -338,6 +336,69 @@ def load_magpie_model( return model, checkpoint_name +def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSModel, str]: + """Load an EasyMagpieTTSModel (decoder-only) from checkpoint or NeMo archive. + + Supports two loading modes: + 1. Checkpoint mode: hparams.yaml + .ckpt file + 2. NeMo mode: .nemo archive file + + Args: + config: Model loading configuration. + device: Device to load the model onto ("cuda" or "cpu"). + + Returns: + Tuple of (loaded model, checkpoint name for output labeling). + + Raises: + ValueError: If configuration is invalid. 
+ """ + config.validate() + + if config.hparams_file is not None and config.checkpoint_file is not None: + model_cfg = OmegaConf.load(config.hparams_file) + + if "cfg" in model_cfg: + model_cfg = model_cfg.cfg + if config.hparams_from_wandb: + model_cfg = model_cfg.value + + with open_dict(model_cfg): + model_cfg.codecmodel_path = config.codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel(cfg=model_cfg) + + logging.info(f"Loading weights from checkpoint: {config.checkpoint_file}") + ckpt = torch.load(config.checkpoint_file) + state_dict = ckpt['state_dict'] + model.load_state_dict(state_dict) + + checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") + else: + if config.nemo_file.startswith("nvidia/"): + model = EasyMagpieTTSModel.from_pretrained(config.nemo_file) + checkpoint_name = config.nemo_file.split("/")[-1] + else: + logging.info(f"Loading model from NeMo archive: {config.nemo_file}") + model_cfg = EasyMagpieTTSModel.restore_from(config.nemo_file, return_config=True) + + with open_dict(model_cfg): + model_cfg.codecmodel_path = config.codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") + + model.to(device) + model.eval() + logging.info("EasyMagpieTTS model loaded and ready for inference.") + + return model, checkpoint_name + + def _log_transformer_component(name: str, cfg: DictConfig, use_moe: bool = False) -> dict: """Log architecture info for a single transformer component and return its FLOPs metrics. 
@@ -414,23 +475,22 @@ def _log_transformer_component(name: str, cfg: DictConfig, use_moe: bool = False return flops_info -def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str, dict]]: +def log_model_architecture_summary(model) -> Tuple[str, Dict[str, dict]]: """Log model architecture summary including MoE configuration. Detects and logs MoE configuration for each transformer component, - computing FLOPs metrics and parameter counts. + computing FLOPs metrics and parameter counts. Gracefully handles + decoder-only models (EasyMagpieTTSModel) that use HuggingFace/Nemotron + decoders without the d_model/d_ffn config structure. Args: - model: Loaded MagpieTTS model. + model: Loaded MagpieTTS or EasyMagpieTTS model. Returns: Tuple of: - moe_info: String for checkpoint naming (e.g., "MoE_8x2_d2048_softmax_"), empty for dense models - flops_per_component: Dict mapping component name (e.g., "decoder") to its FLOPs metrics dict """ - if isinstance(model, EasyMagpieTTSModel): - return "", {} - logging.info("=" * 60) logging.info("MODEL ARCHITECTURE SUMMARY") logging.info("=" * 60) @@ -438,23 +498,28 @@ def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str flops_per_component: Dict[str, dict] = {} use_moe = getattr(model.cfg, 'use_moe', False) - # Log optional encoder if present - if hasattr(model.cfg, 'encoder'): + # Log optional encoder if present (encoder-decoder models) + if hasattr(model.cfg, 'encoder') and hasattr(model.cfg.encoder, 'd_model'): flops_per_component['encoder'] = _log_transformer_component('encoder', model.cfg.encoder) # Log optional context_encoder if present - if hasattr(model.cfg, 'context_encoder'): + if hasattr(model.cfg, 'context_encoder') and hasattr(model.cfg.context_encoder, 'd_model'): flops_per_component['context_encoder'] = _log_transformer_component( 'context_encoder', model.cfg.context_encoder ) - # Decoder is required - always present in MagpieTTS. MoE only applies to decoder. 
- flops_per_component['decoder'] = _log_transformer_component('decoder', model.cfg.decoder, use_moe=use_moe) + # Decoder -- only log detailed FLOPs for encoder-decoder models whose + # decoder config exposes d_model/d_ffn. Decoder-only models (EasyMagpieTTS) + # use HuggingFace or Nemotron decoders with a different config shape. + decoder_cfg = getattr(model.cfg, 'decoder', None) + if decoder_cfg is not None and hasattr(decoder_cfg, 'd_model'): + flops_per_component['decoder'] = _log_transformer_component('decoder', decoder_cfg, use_moe=use_moe) + else: + logging.info("DECODER: detailed FLOPs logging not available for this model type") # Build MoE info string for checkpoint naming moe_info = "" - if use_moe: - decoder_cfg = model.cfg.decoder + if use_moe and decoder_cfg is not None and hasattr(decoder_cfg, 'num_experts'): moe_info = ( f"decoder-MoE_{decoder_cfg.num_experts}x{decoder_cfg.top_k_experts}" f"_d{decoder_cfg.d_ffn}_{decoder_cfg.routing_strategy}_" @@ -488,4 +553,4 @@ def get_experiment_name_from_checkpoint_path(checkpoint_path: str) -> str: Returns: The experiment name (parent directory of checkpoints folder). 
""" - return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) + return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) \ No newline at end of file diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 6d91ad25f976..027ca47a4e82 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -1411,7 +1411,7 @@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st if isinstance(cfg.init_from_ptl_ckpt, str): # Restore checkpoint ckpt_path = cfg.pop('init_from_ptl_ckpt') - ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) + ckpt = torch.load(ckpt_path, map_location=map_location) # Restore checkpoint into current model self.load_state_dict(ckpt['state_dict'], strict=False) diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py deleted file mode 100644 index 0ea66e2870ef..000000000000 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ /dev/null @@ -1,491 +0,0 @@ -""" -Test script to verify that infer_batch (teacher-forced) produces the same audio code -and phoneme predictions as process_batch (single forward pass). - -Usage: - python tests/collections/tts/test_infer_vs_process_batch.py --codecmodel_path /path/to/codec.nemo - -The script: -1. Builds a tiny NemotronH-backed EasyMagpieTTSModel with a real codec model. -2. Creates synthetic random inputs (with variable lengths per batch item). -3. Runs process_batch (full-sequence forward) and infer_batch (streaming, teacher-forced). -4. Compares the argmax audio code predictions and phoneme predictions from both paths. -5. Repeats for multiple configurations. 
-""" - -import argparse -import sys -import torch -from omegaconf import OmegaConf - -from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel - - -def build_minimal_config(codecmodel_path: str) -> OmegaConf: - """Build a minimal OmegaConf config for a tiny NemotronH model.""" - hidden_size = 256 - - cfg_dict = { - # Decoder backend - 'decoder_type': 'nemotron_h', - 'nemotron_h_config': { - 'hidden_size': hidden_size, - 'num_hidden_layers': 2, - 'vocab_size': 131072, - 'num_attention_heads': 4, - 'num_key_value_heads': 2, - 'attention_dropout': 0.0, - 'attention_bias': False, - 'max_position_embeddings': 4096, - 'mamba_num_heads': 16, - 'mamba_head_dim': 16, - 'ssm_state_size': 128, - 'conv_kernel': 4, - 'n_groups': 8, - 'chunk_size': 256, - 'mamba_hidden_act': 'silu', - 'use_conv_bias': True, - 'use_bias': False, - 'intermediate_size': 512, - 'mlp_hidden_act': 'silu', - 'mlp_bias': False, - 'hybrid_override_pattern': 'M*', # All Mamba layers - 'layer_norm_epsilon': 1e-5, - 'residual_in_fp32': True, - }, - 'embedding_dim': hidden_size, - 'hidden_dim': hidden_size, - 'audio_embedding_dim': hidden_size, - 'codecmodel_path': codecmodel_path, - # Text tokenizer - use a simple AutoTokenizer - 'text_tokenizers': { - 'test_tokenizer': { - '_target_': 'AutoTokenizer', - 'pretrained_model': 'gpt2', - }, - }, - # Phoneme tokenizer - 'phoneme_tokenizer': { - '_target_': 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer', - 'tokenizer_path': 'scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json', - }, - 'phoneme_stacking_factor': 1, - # Training modes (single streaming mode) - 'training_modes': [ - { - 'text_input_mode': 'streaming', - 'streaming_phonemes_delay': 4, - 'streaming_speech_delay': 8, - }, - ], - 'frame_stacking_factor': 2, - 'cfg_unconditional_prob': 0.0, - 'dropout_text_input_prob': 0.0, - 'local_transformer_type': 'none', - 'run_val_inference': False, - # Optim placeholder (required by 
ModelPT but not used) - 'optim': { - '_target_': 'torch.optim.AdamW', - 'lr': 1e-4, - }, - # No dataloaders - } - return OmegaConf.create(cfg_dict) - - -def create_synthetic_batch( - model, - batch_size=2, - text_lens_list=None, - audio_frames_list=None, - context_text_lens_list=None, - context_audio_frames_list=None, - phoneme_lens_list=None, - device='cpu', -): - """Create a synthetic batch with random valid token IDs and variable lengths per item. - - If *_list args are None, defaults to uniform lengths for all items. - """ - num_codebooks = model.num_audio_codebooks - codebook_size = model.codebook_size - text_vocab_size = model.bos_id # valid text tokens are [0, bos_id) - phoneme_vocab_size = model.phoneme_tokenizer.vocab_size - 2 # exclude BOS/EOS - - # Defaults - if text_lens_list is None: - text_lens_list = [20] * batch_size - if audio_frames_list is None: - audio_frames_list = [30] * batch_size - if context_text_lens_list is None: - context_text_lens_list = [10] * batch_size - if context_audio_frames_list is None: - context_audio_frames_list = [15] * batch_size - if phoneme_lens_list is None: - phoneme_lens_list = [25] * batch_size - - assert len(text_lens_list) == batch_size - assert len(audio_frames_list) == batch_size - assert len(context_text_lens_list) == batch_size - assert len(context_audio_frames_list) == batch_size - assert len(phoneme_lens_list) == batch_size - - # Max lengths for padding - max_text_len = max(text_lens_list) - max_audio_frames = max(audio_frames_list) - max_context_text_len = max(context_text_lens_list) - max_context_audio_frames = max(context_audio_frames_list) - max_phoneme_len = max(phoneme_lens_list) - - # Text tokens: random tokens + EOS at the end (matching dataset behavior) - text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) - for b in range(batch_size): - tl = text_lens_list[b] - text[b, : tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) - text[b, tl - 1] = model.eos_id # 
EOS as last valid token - text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) - - # Context text tokens - context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) - for b in range(batch_size): - cl = context_text_lens_list[b] - context_text_tokens[b, :cl] = torch.randint(0, text_vocab_size, (cl,), device=device) - context_text_tokens_lens = torch.tensor(context_text_lens_list, dtype=torch.long, device=device) - - # Audio codes (raw, without BOS/EOS) - audio_codes = torch.zeros(batch_size, num_codebooks, max_audio_frames, dtype=torch.long, device=device) - for b in range(batch_size): - af = audio_frames_list[b] - audio_codes[b, :, :af] = torch.randint(0, codebook_size, (num_codebooks, af), device=device) - audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) - - # Context audio codes (raw, without BOS/EOS) - context_audio_codes = torch.zeros( - batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device - ) - for b in range(batch_size): - caf = context_audio_frames_list[b] - context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) - context_audio_codes_lens = torch.tensor(context_audio_frames_list, dtype=torch.long, device=device) - - # Phoneme tokens (raw IDs, BOS/EOS will be added by the model) - phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) - for b in range(batch_size): - pl = phoneme_lens_list[b] - phoneme_tokens[b, :pl] = torch.randint(0, phoneme_vocab_size, (pl,), device=device) - phoneme_tokens_lens = torch.tensor(phoneme_lens_list, dtype=torch.long, device=device) - - batch = { - 'text': text, - 'text_lens': text_lens, - 'context_text_tokens': context_text_tokens, - 'context_text_tokens_lens': context_text_tokens_lens, - 'audio_codes': audio_codes, - 'audio_codes_lens': audio_codes_lens, - 'context_audio_codes': context_audio_codes, - 
'context_audio_codes_lens': context_audio_codes_lens, - 'phoneme_tokens': phoneme_tokens, - 'phoneme_tokens_lens': phoneme_tokens_lens, - } - return batch - - -def compare_audio_codes(model, pb_output, ib_output, batch): - """Compare audio codes from process_batch and infer_batch. Returns True if all match.""" - C = model.num_audio_codebooks - S = model.frame_stacking_factor - C_stacked = C * S - V = model.num_all_tokens_per_codebook - pb_logits = pb_output.logits # (B, T_stacked, C_stacked * V) - T_stacked = pb_logits.size(1) - batch_size = batch['text'].size(0) - - # Extract per-codebook argmax at stacked resolution - pb_stacked_codes_list = [] - for cb_idx in range(C_stacked): - si = cb_idx * V - ei = si + V - cb_logits = pb_logits[:, :, si:ei] # (B, T_stacked, V) - cb_preds = cb_logits.argmax(dim=-1) # (B, T_stacked) - pb_stacked_codes_list.append(cb_preds) - pb_stacked_codes = torch.stack(pb_stacked_codes_list, dim=1) # (B, C_stacked, T_stacked) - - # Unstack: (B, C*S, T_stacked) -> (B, C, S, T_stacked) -> (B, C, T_stacked, S) -> (B, C, T_stacked*S) - pb_unstacked = pb_stacked_codes.view(batch_size, C, S, T_stacked) - pb_unstacked = pb_unstacked.permute(0, 1, 3, 2).contiguous() - pb_unstacked = pb_unstacked.reshape(batch_size, C, T_stacked * S) - pb_unstacked_lens = pb_output.audio_codes_lens_target * S - - ib_codes = ib_output.predicted_codes - ib_codes_lens = ib_output.predicted_codes_lens - - print(f" process_batch argmax codes (unstacked): {pb_unstacked.shape}, lens: {pb_unstacked_lens.tolist()}") - print(f" infer_batch predicted codes: {ib_codes.shape}, lens: {ib_codes_lens.tolist()}") - - all_match = True - for b in range(batch_size): - pb_len = pb_unstacked_lens[b].item() - ib_len = ib_codes_lens[b].item() - compare_len = min(pb_len, ib_len) - - if compare_len == 0: - print(f" Batch item {b}: No codes to compare (pb_len={pb_len}, ib_len={ib_len})") - continue - - pb_codes_b = pb_unstacked[b, :, :compare_len] - ib_codes_b = ib_codes[b, :, :compare_len] - 
- matches = (pb_codes_b == ib_codes_b).all() - num_matching = (pb_codes_b == ib_codes_b).sum().item() - total = pb_codes_b.numel() - match_pct = 100.0 * num_matching / total if total > 0 else 0.0 - - print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") - print(f" Audio match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") - - if not matches: - all_match = False - mismatch_mask = pb_codes_b != ib_codes_b - mismatch_positions = mismatch_mask.nonzero(as_tuple=False) - num_show = min(10, mismatch_positions.size(0)) - for i in range(num_show): - cb, t = mismatch_positions[i].tolist() - print( - f" Mismatch at codebook={cb}, time={t}: " - f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}" - ) - - return all_match - - -def compare_phoneme_predictions(model, pb_output, ib_output, batch): - """Compare phoneme predictions from process_batch and infer_batch. Returns True if all match.""" - if pb_output.phoneme_logits is None: - print(" No phoneme logits from process_batch (no phoneme tokenizer?). Skipping.") - return True - if ib_output.predicted_phoneme_tokens is None: - print(" No phoneme predictions from infer_batch. 
Skipping.") - return True - - batch_size = batch['text'].size(0) - phoneme_stacking_factor = model.phoneme_stacking_factor - phoneme_vocab_size = model.phoneme_vocab_size - - # Extract argmax phoneme predictions from process_batch logits - # phoneme_logits: (B, T_phoneme, phoneme_stacking_factor * phoneme_vocab_size) - pb_phoneme_logits = pb_output.phoneme_logits - T_phoneme = pb_phoneme_logits.size(1) - - pb_phoneme_preds_list = [] - for sf_idx in range(phoneme_stacking_factor): - si = sf_idx * phoneme_vocab_size - ei = si + phoneme_vocab_size - sf_logits = pb_phoneme_logits[:, :, si:ei] # (B, T_phoneme, V_phoneme) - sf_preds = sf_logits.argmax(dim=-1) # (B, T_phoneme) - pb_phoneme_preds_list.append(sf_preds) - pb_phoneme_preds = torch.stack(pb_phoneme_preds_list, dim=1) # (B, phoneme_stacking_factor, T_phoneme) - pb_phoneme_lens = pb_output.phoneme_tokens_lens_target # (B,) number of phoneme prediction steps - - # infer_batch phoneme predictions: (B, phoneme_stacking_factor, T_all_steps) - ib_phoneme_preds = ib_output.predicted_phoneme_tokens - ib_phoneme_lens = ib_output.predicted_phoneme_tokens_lens - - print(f" process_batch phoneme preds: {pb_phoneme_preds.shape}, lens: {pb_phoneme_lens.tolist()}") - print(f" infer_batch phoneme preds: {ib_phoneme_preds.shape}, lens: {ib_phoneme_lens.tolist()}") - - # Get start indices for infer_batch phoneme predictions - ib_start_idx = ib_output.phoneme_prediction_start_idx # (B,) - - all_match = True - for b in range(batch_size): - pb_len = pb_phoneme_lens[b].item() - ib_len = ib_phoneme_lens[b].item() - compare_len = min(pb_len, ib_len) - - if compare_len == 0: - print(f" Batch item {b}: No phonemes to compare (pb_len={pb_len}, ib_len={ib_len})") - continue - - # process_batch phoneme preds start from 0 (already sliced to prediction region) - pb_ph_b = pb_phoneme_preds[b, :, :compare_len] - - # infer_batch phoneme preds: slice from start_idx for this batch item - start = max(0, ib_start_idx[b].item()) - ib_ph_b = 
ib_phoneme_preds[b, :, start : start + compare_len] - - matches = (pb_ph_b == ib_ph_b).all() - num_matching = (pb_ph_b == ib_ph_b).sum().item() - total = pb_ph_b.numel() - match_pct = 100.0 * num_matching / total if total > 0 else 0.0 - - print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") - print(f" Phoneme match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") - - if not matches: - all_match = False - mismatch_mask = pb_ph_b != ib_ph_b - mismatch_positions = mismatch_mask.nonzero(as_tuple=False) - num_show = min(10, mismatch_positions.size(0)) - for i in range(num_show): - sf, t = mismatch_positions[i].tolist() - print( - f" Mismatch at stacking_factor={sf}, time={t}: " - f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}" - ) - - return all_match - - -def run_single_test(model, batch, test_name, device): - """Run a single test comparing process_batch and infer_batch outputs.""" - print(f"\n{'='*60}") - print(f"TEST: {test_name}") - print(f"{'='*60}") - - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - print(f" {k}: shape={v.shape}, dtype={v.dtype}") - - # Run process_batch - print("\n Running process_batch...") - training_mode = model.training_modes[0] - with torch.inference_mode(): - pb_output = model.process_batch( - text=batch['text'], - text_lens=batch['text_lens'], - context_text_tokens=batch['context_text_tokens'], - context_text_tokens_lens=batch['context_text_tokens_lens'], - audio_codes=batch['audio_codes'], - audio_codes_lens=batch['audio_codes_lens'], - context_audio_codes=batch['context_audio_codes'], - context_audio_codes_lens=batch['context_audio_codes_lens'], - phoneme_tokens=batch['phoneme_tokens'], - phoneme_tokens_lens=batch['phoneme_tokens_lens'], - mode='val', - training_mode=training_mode, - ) - - # Run infer_batch (teacher-forced) - print(" Running infer_batch (teacher-forced)...") - ib_output = model.infer_batch( - batch=batch, - max_decoder_steps=1000, - 
temperature=0.0, - topk=80, - use_cfg=False, - use_local_transformer_for_inference=False, - phoneme_input_type='gt', - phoneme_sampling_method='argmax', - use_teacher_forced=True, - ) - - # Compare audio codes - print("\n --- Audio Codes Comparison ---") - audio_match = compare_audio_codes(model, pb_output, ib_output, batch) - - # Compare phoneme predictions - print("\n --- Phoneme Predictions Comparison ---") - phoneme_match = compare_phoneme_predictions(model, pb_output, ib_output, batch) - - success = audio_match and phoneme_match - if success: - print(f"\n ✓ {test_name}: PASSED (audio + phoneme match)") - else: - parts = [] - if not audio_match: - parts.append("audio") - if not phoneme_match: - parts.append("phoneme") - print(f"\n ✗ {test_name}: FAILED ({' and '.join(parts)} mismatch)") - - return success - - -def main(): - parser = argparse.ArgumentParser(description='Test infer_batch vs process_batch') - parser.add_argument('--codecmodel_path', type=str, required=True, help='Path to codec model .nemo file') - parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') - args = parser.parse_args() - - device = args.device - print(f"Using device: {device}") - - # 1. 
Build config and model - print("Building minimal config...") - cfg = build_minimal_config(args.codecmodel_path) - - print("Instantiating EasyMagpieTTSModel (tiny NemotronH + real codec)...") - model = EasyMagpieTTSModel(cfg=cfg, trainer=None) - model = model.to(device) - model.eval() - print(f" num_audio_codebooks={model.num_audio_codebooks}, codebook_size={model.codebook_size}") - print(f" frame_stacking_factor={model.frame_stacking_factor}") - print(f" phoneme_vocab_size={model.phoneme_tokenizer.vocab_size}") - - # Define test configurations: (test_name, kwargs_for_create_synthetic_batch) - test_configs = [ - ( - "Uniform lengths (B=2, text=20, audio=30, ctx_text=10, ctx_audio=15, phoneme=25)", - dict( - batch_size=2, - text_lens_list=[20, 20], - audio_frames_list=[30, 30], - context_text_lens_list=[10, 10], - context_audio_frames_list=[15, 15], - phoneme_lens_list=[25, 25], - ), - ), - ( - "Variable text & context lens (B=2, text=[15,25], ctx_text=[8,12], ctx_audio=[10,20])", - dict( - batch_size=2, - text_lens_list=[15, 25], - audio_frames_list=[30, 30], - context_text_lens_list=[8, 12], - context_audio_frames_list=[10, 20], - phoneme_lens_list=[20, 30], - ), - ), - ( - "Variable audio & phoneme lens (B=2, audio=[20,40], phoneme=[15,35])", - dict( - batch_size=2, - text_lens_list=[20, 20], - audio_frames_list=[20, 40], - context_text_lens_list=[10, 10], - context_audio_frames_list=[15, 15], - phoneme_lens_list=[15, 35], - ), - ), - ( - "All different (B=3)", - dict( - batch_size=3, - text_lens_list=[12, 20, 28], - audio_frames_list=[20, 30, 40], - context_text_lens_list=[6, 10, 14], - context_audio_frames_list=[8, 15, 22], - phoneme_lens_list=[15, 25, 35], - ), - ), - ] - - all_passed = True - for test_name, kwargs in test_configs: - batch = create_synthetic_batch(model, device=device, **kwargs) - passed = run_single_test(model, batch, test_name, device) - if not passed: - all_passed = False - - # Final summary - print(f"\n{'='*60}") - if all_passed: - print("✓ 
ALL TESTS PASSED") - else: - print("✗ SOME TESTS FAILED") - sys.exit(1) - print(f"{'='*60}") - - -if __name__ == '__main__': - main() diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh index 368b5c83bba5..b6d87e91a254 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh @@ -14,7 +14,7 @@ # Tests a 4x-stacked model with local transformer inference. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh index a591497f22e0..4e917733f59a 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh index 5ed8d48f5aff..8eb30eb40c36 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh index 3a9415bbc2b3..eed95fc5a64e 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh index ec8b6b885212..c21454d39cb1 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh index a0694c16b9ba..96e20304197a 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci_longform_tiny \ From 5eaf1a4b93aae1e1326f3cbc4349f3b6b024c75c Mon Sep 17 00:00:00 2001 From: shehzeen Date: Wed, 11 Mar 2026 17:31:53 +0000 Subject: [PATCH 88/94] Apply isort and black reformatting Signed-off-by: shehzeen --- .../tts/modules/magpietts_inference/inference.py | 10 +++------- .../tts/modules/magpietts_inference/utils.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index d5d34537e088..c343a9d31f9a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -125,9 +125,7 @@ def build_identifier(self) -> str: class EasyMagpieInferenceConfig(BaseInferenceConfig): """Configuration for decoder-only EasyMagpieTTSModel inference.""" - model_inference_parameters: EasyModelInferenceParameters = field( - default_factory=EasyModelInferenceParameters - ) + model_inference_parameters: EasyModelInferenceParameters = field(default_factory=EasyModelInferenceParameters) phoneme_input_type: str = "gt" phoneme_sampling_method: str = "argmax" dropout_text_input: bool = False @@ -176,8 +174,7 @@ def create_dataset( dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, - ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: - ... + ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: ... 
@abc.abstractmethod def run_inference_on_dataset( @@ -189,8 +186,7 @@ def run_inference_on_dataset( save_cross_attention_maps: bool = True, save_context_audio: bool = True, save_predicted_codes: bool = True, - ) -> Tuple[List[dict], List[str], List[str]]: - ... + ) -> Tuple[List[dict], List[str], List[str]]: ... # -- shared helpers ------------------------------------------------------ diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index ca89356494fa..a14cd0789f7a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -553,4 +553,4 @@ def get_experiment_name_from_checkpoint_path(checkpoint_path: str) -> str: Returns: The experiment name (parent directory of checkpoints folder). """ - return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) \ No newline at end of file + return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) From 6d96df464ab0eb445844668a7f6aa77b0449346e Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Mar 2026 21:24:08 -0700 Subject: [PATCH 89/94] Paarthneekhara/magpietts decoderonly 2601 (#70) * clean up code, rename back to magpietts_inference.py Signed-off-by: Shehzeen Hussain * bug fixes, inference runs now Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- docs/source/tts/magpietts-longform.rst | 6 +- docs/source/tts/magpietts.rst | 2 +- .../{tts_infer.py => magpietts_inference.py} | 60 ++++++++----------- nemo/collections/tts/models/easy_magpietts.py | 25 +------- .../tts/models/easy_magpietts_inference.py | 44 ++++++++++---- .../modules/magpietts_inference/inference.py | 14 ++--- .../tts/modules/magpietts_inference/utils.py | 34 ++++++++--- ...S_InferEvaluate_Magpietts_FrameStacking.sh | 2 +- ...TS_InferEvaluate_Magpietts_MoE_ZeroShot.sh | 2 +- ...TS_InferEvaluate_Magpietts_SeenSpeakers.sh | 2 +- 
...L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh | 2 +- ...Evaluatelongform_Magpietts_MoE_ZeroShot.sh | 2 +- ...nferEvaluatelongform_Magpietts_ZeroShot.sh | 2 +- 13 files changed, 102 insertions(+), 95 deletions(-) rename examples/tts/{tts_infer.py => magpietts_inference.py} (94%) diff --git a/docs/source/tts/magpietts-longform.rst b/docs/source/tts/magpietts-longform.rst index fb3eeb659d33..33aef42a5abe 100644 --- a/docs/source/tts/magpietts-longform.rst +++ b/docs/source/tts/magpietts-longform.rst @@ -169,7 +169,7 @@ The ``do_tts`` method automatically detects whether longform inference is needed sf.write("output.wav", long_audio[0].cpu().numpy(), 22050) -Method 2: Using CLI (``tts_infer.py``) +Method 2: Using CLI (``magpietts_inference.py``) ------------------------------------------------ For batch inference from manifests: @@ -177,7 +177,7 @@ For batch inference from manifests: .. code-block:: bash # Auto-detect longform based on text length (default) - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ @@ -185,7 +185,7 @@ For batch inference from manifests: --longform_mode auto # Force longform inference for all inputs - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ diff --git a/docs/source/tts/magpietts.rst b/docs/source/tts/magpietts.rst index 6d297a694596..b79c11ea88ff 100644 --- a/docs/source/tts/magpietts.rst +++ b/docs/source/tts/magpietts.rst @@ -130,7 +130,7 @@ Several parameters control the generation behavior. The temperature setting affe .. 
code-block:: bash - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts_model.nemo \ --codecmodel_path /path/to/audio_codec.nemo \ --datasets your_evaluation_set \ diff --git a/examples/tts/tts_infer.py b/examples/tts/magpietts_inference.py similarity index 94% rename from examples/tts/tts_infer.py rename to examples/tts/magpietts_inference.py index 2c3bec0aa7f7..fca92fccddc4 100644 --- a/examples/tts/tts_infer.py +++ b/examples/tts/magpietts_inference.py @@ -26,7 +26,7 @@ Example usage: # MagpieTTS inference (encoder-decoder, default) - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -34,7 +34,7 @@ --codecmodel_path /path/to/codec.nemo # EasyMagpieTTS inference (decoder-only) - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type easy_magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -42,7 +42,7 @@ --codecmodel_path /path/to/codec.nemo # With evaluation - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type magpie \\ --hparams_files /path/to/hparams.yaml \\ --checkpoint_files /path/to/model.ckpt \\ @@ -66,7 +66,7 @@ import numpy as np from nemo.collections.asr.parts.utils.manifest_utils import read_manifest -from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters +from nemo.collections.tts.models.easy_magpietts_inference import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config from nemo.collections.tts.modules.magpietts_inference.evaluation import ( @@ -161,11 +161,6 @@ def filter_datasets(dataset_meta_info: dict, datasets: 
Optional[List[str]]) -> L return datasets -# --------------------------------------------------------------------------- -# Core inference + evaluation orchestration (model-type agnostic) -# --------------------------------------------------------------------------- - - def run_inference_and_evaluation( runner: BaseInferenceRunner, checkpoint_name: str, @@ -355,15 +350,18 @@ def run_inference_and_evaluation( return None, None -# --------------------------------------------------------------------------- -# CLI argument parser -# --------------------------------------------------------------------------- +def _get_shared_inference_param_names() -> set: + """Return the field names shared by ModelInferenceParameters and EasyModelInferenceParameters.""" + magpie_fields = {f.name for f in fields(ModelInferenceParameters)} + easy_fields = {f.name for f in fields(EasyModelInferenceParameters)} + return magpie_fields & easy_fields def _add_inference_param_fields( group: argparse._ArgumentGroup, param_cls: type, skip_fields: Optional[set] = None, + only_fields: Optional[set] = None, ) -> None: """Auto-generate argparse arguments from fields of a dataclass. @@ -371,12 +369,15 @@ def _add_inference_param_fields( group: The argparse argument group to add arguments to. param_cls: The dataclass whose fields to add. skip_fields: Field names to skip (already added by another group). + only_fields: If provided, only add fields whose names are in this set. 
""" if skip_fields is None: skip_fields = set() for f in fields(param_cls): if f.name in skip_fields: continue + if only_fields is not None and f.name not in only_fields: + continue extra_args: dict = {"type": f.type} if f.type == bool: extra_args = {"action": "store_true"} @@ -399,7 +400,7 @@ def _add_common_args(parser: argparse.ArgumentParser) -> None: default='magpie', choices=['magpie', 'easy_magpie'], help='Model type: "magpie" for encoder-decoder MagpieTTSModel, ' - '"easy_magpie" for decoder-only EasyMagpieTTSModel', + '"easy_magpie" for decoder-only EasyMagpieTTSInferenceModel', ) # Model loading @@ -469,8 +470,9 @@ def _add_common_args(parser: argparse.ArgumentParser) -> None: infer_group.add_argument('--use_cfg', action='store_true', help='Enable classifier-free guidance') infer_group.add_argument('--use_local_transformer', action='store_true') - # Shared model inference parameters (max_decoder_steps, temperature, topk, cfg_scale) - _add_inference_param_fields(infer_group, EasyModelInferenceParameters) + # Model inference parameters shared by both MagpieTTS and EasyMagpieTTS + shared_param_names = _get_shared_inference_param_names() + _add_inference_param_fields(infer_group, ModelInferenceParameters, only_fields=shared_param_names) # Evaluation eval_group = parser.add_argument_group('Evaluation') @@ -499,9 +501,8 @@ def _add_magpie_args(parser: argparse.ArgumentParser) -> None: group = parser.add_argument_group('MagpieTTS-specific Parameters') # MagpieTTS-specific model inference parameters (attention prior, EOS, etc.) - # Skip fields already added by the common inference group. 
- shared_field_names = {f.name for f in fields(EasyModelInferenceParameters)} - _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_field_names) + shared_param_names = _get_shared_inference_param_names() + _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_param_names) group.add_argument('--maskgit_n_steps', type=int, default=3) group.add_argument('--maskgit_noise_scale', type=float, default=0.0) @@ -514,7 +515,7 @@ def _add_magpie_args(parser: argparse.ArgumentParser) -> None: def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: - """Add arguments specific to decoder-only EasyMagpieTTSModel.""" + """Add arguments specific to decoder-only EasyMagpieTTSInferenceModel.""" group = parser.add_argument_group('EasyMagpieTTS-specific Parameters') group.add_argument( '--phoneme_input_type', @@ -532,9 +533,10 @@ def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: ) group.add_argument('--dropout_text_input', action='store_true', help='Force dropout on text input') group.add_argument( - '--legacy_context_stacking', - action='store_true', - help='Use audio_bos_id/audio_eos_id for context stacking', + '--phoneme_tokenizer_path', + type=str, + default=None, + help='Override path to the phoneme tokenizer file (overrides the path stored in the checkpoint config)', ) @@ -551,11 +553,6 @@ def create_argument_parser() -> argparse.ArgumentParser: return parser -# --------------------------------------------------------------------------- -# Config builders (one per model type) -# --------------------------------------------------------------------------- - - def _build_inference_params_from_args(param_cls: type, args): """Extract inference parameters from parsed CLI args for the given dataclass.""" params = {} @@ -592,15 +589,8 @@ def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, 
dropout_text_input=args.dropout_text_input, - legacy_context_stacking=args.legacy_context_stacking, ) - -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - - def main(argv=None): """Entry point for TTS inference and evaluation.""" parser = create_argument_parser() @@ -656,6 +646,7 @@ def main(argv=None): legacy_codebooks=args.legacy_codebooks, legacy_text_conditioning=args.legacy_text_conditioning, hparams_from_wandb=args.hparams_file_from_wandb, + phoneme_tokenizer_path=getattr(args, 'phoneme_tokenizer_path', None), ) model, checkpoint_name = load_fn(model_config) @@ -693,6 +684,7 @@ def main(argv=None): codecmodel_path=args.codecmodel_path, legacy_codebooks=args.legacy_codebooks, legacy_text_conditioning=args.legacy_text_conditioning, + phoneme_tokenizer_path=getattr(args, 'phoneme_tokenizer_path', None), ) model, checkpoint_name = load_fn(model_config) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 19705eed1ad3..5a117432b986 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,7 +14,7 @@ import json import os import random -from dataclasses import dataclass, fields +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import numpy as np @@ -98,29 +98,6 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] -@dataclass -class EasyModelInferenceParameters: - """Inference parameters for the decoder-only EasyMagpieTTS model. - - Attributes: - max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature. - topk: Number of top-probability tokens to consider in sampling. - cfg_scale: Scale factor for classifier-free guidance. 
- """ - - max_decoder_steps: int = 500 - temperature: float = 0.7 - topk: int = 80 - cfg_scale: float = 2.5 - - @classmethod - def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': - field_names = {field.name for field in fields(cls)} - filtered_data = {k: v for k, v in data.items() if k in field_names} - return cls(**filtered_data) - - class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ Magpie-TTS Model Decoder Only Model with training support. diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 765c234e2683..59db7decda0e 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import time -from dataclasses import dataclass +from dataclasses import dataclass, fields from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -184,6 +184,29 @@ class InferBatchOutput: phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens +@dataclass +class EasyModelInferenceParameters: + """Inference parameters for the decoder-only EasyMagpieTTS model. + + Attributes: + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider in sampling. + cfg_scale: Scale factor for classifier-free guidance. 
+ """ + + max_decoder_steps: int = 300 + temperature: float = 0.7 + topk: int = 80 + cfg_scale: float = 2.5 + + @classmethod + def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': + field_names = {field.name for field in fields(cls)} + filtered_data = {k: v for k, v in data.items() if k in field_names} + return cls(**filtered_data) + + class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): """ Inference-only base class for EasyMagpieTTS decoder-only model. @@ -319,6 +342,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.pad_context_text_to_max_duration = False self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) + self.ignore_phoneme_languages = cfg.get('ignore_phoneme_languages', []) super().__init__(cfg=cfg, trainer=trainer) @@ -465,6 +489,12 @@ def _get_state_dict_keys_to_exclude(self): '_codec_model', ] + def setup_training_data(self, train_data_config=None): + pass + + def setup_validation_data(self, val_data_config=None): + pass + def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') self._codec_model.eval() @@ -734,19 +764,11 @@ def prepare_context_tensors( eos_id=self.context_audio_eos_id, ) - # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = ( - self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - ) - stack_eos_id = ( - self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - ) - context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - stack_bos_id, - stack_eos_id, + self.context_audio_bos_id, + self.context_audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index c343a9d31f9a..ab501075c98d 100644 --- 
a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -21,7 +21,7 @@ MagpieInferenceRunner handles the encoder-decoder MagpieTTSModel (chunked text, generate_speech + codes_to_audio). -EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSModel +EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSInferenceModel (infer_batch, returns audio directly). """ from __future__ import annotations @@ -40,7 +40,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters +from nemo.collections.tts.models.easy_magpietts_inference import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -123,13 +123,12 @@ def build_identifier(self) -> str: @dataclass class EasyMagpieInferenceConfig(BaseInferenceConfig): - """Configuration for decoder-only EasyMagpieTTSModel inference.""" + """Configuration for decoder-only EasyMagpieTTSInferenceModel inference.""" model_inference_parameters: EasyModelInferenceParameters = field(default_factory=EasyModelInferenceParameters) phoneme_input_type: str = "gt" phoneme_sampling_method: str = "argmax" dropout_text_input: bool = False - legacy_context_stacking: bool = False def build_identifier(self) -> str: parts = [ @@ -538,19 +537,18 @@ def _compute_end_of_text_flags( # --------------------------------------------------------------------------- -# EasyMagpieInferenceRunner (decoder-only EasyMagpieTTSModel) +# EasyMagpieInferenceRunner (decoder-only 
EasyMagpieTTSInferenceModel) # --------------------------------------------------------------------------- class EasyMagpieInferenceRunner(BaseInferenceRunner): - """Runner for decoder-only EasyMagpieTTSModel. + """Runner for decoder-only EasyMagpieTTSInferenceModel. Uses MagpieTTSDataset and model.infer_batch() which returns audio directly. """ def __init__(self, model, config: EasyMagpieInferenceConfig): super().__init__(model, config) - self.model.legacy_context_stacking = config.legacy_context_stacking def create_dataset( self, @@ -583,6 +581,8 @@ def create_dataset( pad_context_text_to_max_duration=False, context_duration_min=context_duration_min, context_duration_max=context_duration_max, + ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), + add_language_to_context_text=self.model.add_language_to_context_text ) dataset.text_tokenizer = self.model.tokenizer diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index a14cd0789f7a..9c67125f4343 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel from nemo.utils import logging @@ -119,6 +119,7 @@ class ModelLoadConfig: legacy_codebooks: Use legacy codebook indices for old checkpoints. legacy_text_conditioning: Use legacy text conditioning for old checkpoints. hparams_from_wandb: Whether hparams file is from wandb export. + phoneme_tokenizer_path: Override path to the phoneme tokenizer file (EasyMagpieTTS only). 
""" hparams_file: Optional[str] = None @@ -128,6 +129,7 @@ class ModelLoadConfig: legacy_codebooks: bool = False legacy_text_conditioning: bool = False hparams_from_wandb: bool = False + phoneme_tokenizer_path: Optional[str] = None def validate(self) -> None: """Validate that the configuration is complete and consistent.""" @@ -336,8 +338,13 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma return model, checkpoint_name -def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSModel, str]: - """Load an EasyMagpieTTSModel (decoder-only) from checkpoint or NeMo archive. +def load_easy_magpie_model( + config: ModelLoadConfig, device: str = "cuda" +) -> Tuple[EasyMagpieTTSInferenceModel, str]: + """Load an EasyMagpieTTSInferenceModel (decoder-only) from checkpoint or NeMo archive. + + Uses the inference-only base class rather than the full training model, + which avoids pulling in training-specific dependencies. Supports two loading modes: 1. 
Checkpoint mode: hparams.yaml + .ckpt file @@ -367,8 +374,10 @@ def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tup model_cfg.codecmodel_path = config.codecmodel_path model_cfg.train_ds = None model_cfg.validation_ds = None + if config.phoneme_tokenizer_path and hasattr(model_cfg, 'phoneme_tokenizer'): + model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path - model = EasyMagpieTTSModel(cfg=model_cfg) + model = EasyMagpieTTSInferenceModel(cfg=model_cfg) logging.info(f"Loading weights from checkpoint: {config.checkpoint_file}") ckpt = torch.load(config.checkpoint_file) @@ -378,22 +387,29 @@ def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tup checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: if config.nemo_file.startswith("nvidia/"): - model = EasyMagpieTTSModel.from_pretrained(config.nemo_file) + model = EasyMagpieTTSInferenceModel.from_pretrained(config.nemo_file) checkpoint_name = config.nemo_file.split("/")[-1] else: logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = EasyMagpieTTSModel.restore_from(config.nemo_file, return_config=True) + model_cfg = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg.codecmodel_path = config.codecmodel_path model_cfg.train_ds = None model_cfg.validation_ds = None + if config.phoneme_tokenizer_path and hasattr(model_cfg, 'phoneme_tokenizer'): + model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path + # Override target so restore_from instantiates the inference class, + # not the training subclass stored in the .nemo config. 
+ model_cfg.target = ( + 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' + ) - model = EasyMagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + model = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, override_config_path=model_cfg) checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") model.to(device) - model.eval() + model.eval().float() logging.info("EasyMagpieTTS model loaded and ready for inference.") return model, checkpoint_name @@ -480,7 +496,7 @@ def log_model_architecture_summary(model) -> Tuple[str, Dict[str, dict]]: Detects and logs MoE configuration for each transformer component, computing FLOPs metrics and parameter counts. Gracefully handles - decoder-only models (EasyMagpieTTSModel) that use HuggingFace/Nemotron + decoder-only models (EasyMagpieTTSInferenceModel) that use HuggingFace/Nemotron decoders without the d_model/d_ffn config structure. Args: diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh index b6d87e91a254..368b5c83bba5 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh @@ -14,7 +14,7 @@ # Tests a 4x-stacked model with local transformer inference. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh index 4e917733f59a..a591497f22e0 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh index 8eb30eb40c36..5ed8d48f5aff 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh index eed95fc5a64e..3a9415bbc2b3 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh index c21454d39cb1..ec8b6b885212 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh index 96e20304197a..a0694c16b9ba 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci_longform_tiny \ From cb6926733fb8b05ad008368bab3b533c9c48f28e Mon Sep 17 00:00:00 2001 From: shehzeen Date: Thu, 12 Mar 2026 04:24:49 +0000 Subject: [PATCH 90/94] Apply isort and black reformatting Signed-off-by: shehzeen --- examples/tts/magpietts_inference.py | 1 + .../tts/modules/magpietts_inference/inference.py | 2 +- nemo/collections/tts/modules/magpietts_inference/utils.py | 8 ++------ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index fca92fccddc4..b8a91d3ea307 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -591,6 +591,7 @@ def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: dropout_text_input=args.dropout_text_input, ) + def main(argv=None): """Entry point for TTS inference and evaluation.""" parser = create_argument_parser() diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index ab501075c98d..e936f81439be 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -582,7 +582,7 @@ def create_dataset( context_duration_min=context_duration_min, context_duration_max=context_duration_max, ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), - add_language_to_context_text=self.model.add_language_to_context_text + add_language_to_context_text=self.model.add_language_to_context_text, ) 
dataset.text_tokenizer = self.model.tokenizer diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 9c67125f4343..47b2553e99eb 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -338,9 +338,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma return model, checkpoint_name -def load_easy_magpie_model( - config: ModelLoadConfig, device: str = "cuda" -) -> Tuple[EasyMagpieTTSInferenceModel, str]: +def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSInferenceModel, str]: """Load an EasyMagpieTTSInferenceModel (decoder-only) from checkpoint or NeMo archive. Uses the inference-only base class rather than the full training model, @@ -401,9 +399,7 @@ def load_easy_magpie_model( model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path # Override target so restore_from instantiates the inference class, # not the training subclass stored in the .nemo config. 
- model_cfg.target = ( - 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' - ) + model_cfg.target = 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' model = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, override_config_path=model_cfg) checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") From db9763dcdcf6aea2baa333015653d070c1f89513 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 12 Mar 2026 16:35:03 -0400 Subject: [PATCH 91/94] refactoring to remove magpie base class Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/__init__.py | 2 - nemo/collections/tts/models/base_magpietts.py | 527 ----------------- nemo/collections/tts/models/easy_magpietts.py | 55 +- .../tts/models/easy_magpietts_inference.py | 117 +++- .../easy_magpietts_preference_optimization.py | 7 +- nemo/collections/tts/models/magpietts.py | 128 +++-- .../magpietts_preference_optimization.py | 3 +- .../modules/magpietts_inference/inference.py | 6 +- .../tts/modules/magpietts_modules.py | 538 ++++++++++++++++++ 9 files changed, 774 insertions(+), 609 deletions(-) delete mode 100644 nemo/collections/tts/models/base_magpietts.py diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 28d49bca1c81..576077bdcddf 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,7 +14,6 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.easy_magpietts_inference import EasyMagpieTTSInferenceModel from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO @@ -32,7 +31,6 @@ __all__ = [ "AlignerModel", 
"AudioCodecModel", - "BaseMagpieTTSModel", "FastPitchModel", "FastPitchModel_SSL", "SSLDisentangler", diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py deleted file mode 100644 index f031ebf98fab..000000000000 --- a/nemo/collections/tts/models/base_magpietts.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List, Optional - -import numpy as np -import torch -from hydra.utils import instantiate -from torch.utils.data import get_worker_info - -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers -from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule -from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths -from nemo.core.classes import ModelPT -from nemo.utils import logging - - -def worker_init_fn(worker_id): - """Per-worker init for DataLoader workers. - - Sets up tokenizers for the dataset (text and optionally phoneme) - when using multiprocessing. 
- """ - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config) - - -class BaseMagpieTTSModel(ModelPT): - """Base class for MagpieTTS models. - - Contains shared functionality for audio codec helpers, special token - manipulation, local transformer functions, and state dict handling. - Subclasses (EasyMagpieTTSModel, MagpieTTSModel) provide their own - ``__init__``, data loading, training/inference logic, etc. - """ - - def _get_state_dict_keys_to_exclude(self) -> List[str]: - """Return list of key substrings to exclude from checkpoint save/load. - - Subclasses should override to specify model-specific exclusions - (e.g. codec model, eval models). - """ - return ['_codec_model'] - - def state_dict(self, destination=None, prefix='', keep_vars=False): - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() - for key in list(state_dict.keys()): - if any(substring in key for substring in keys_substrings_to_exclude): - del state_dict[key] - return state_dict - - def load_state_dict(self, state_dict, strict=True): - if not strict: - super().load_state_dict(state_dict, strict=False) - modules_to_skip = self._get_state_dict_keys_to_exclude() - for name, child in self.named_children(): - if name in modules_to_skip: - continue - if any(param.numel() > 0 for param in child.parameters()): - new_state_dict = {} - for key in state_dict.keys(): - name_with_dot = f"{name}." 
- if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot) :]] = state_dict[key] - child.load_state_dict(new_state_dict) - - def setup_optimizer_param_groups(self): - """Exclude frozen eval/inference-only models from the optimizer.""" - modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) - - excluded_param_ids = set() - for name, module in self.named_children(): - if name in modules_to_exclude: - for param in module.parameters(): - excluded_param_ids.add(id(param)) - - trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] - - logging.info( - f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " - f"{len(excluded_param_ids)} params excluded (eval models)" - ) - - self._optimizer_param_groups = [{"params": trainable_params}] - - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = 
get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - return audio, audio_len, codes - - def pad_audio_codes(self, audio_codes: torch.Tensor): - """Pads the time dimension of the audio codes to a multiple of the frame stacking factor. 
- - Args: - audio_codes: (B, C, T) - Returns: - (B, C, T_padded) - """ - T = audio_codes.size(2) - T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) - num_pad = T_padded - T - audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) - return audio_codes - - def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: - """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled. - - Specifically, we forbid sampling of all special tokens except AUDIO_EOS - which is allowed by default. - - Args: - logits: (B, C, num_audio_tokens_per_codebook) - forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled. - """ - logits[ - :, - :, - SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), - ] = float('-inf') - return logits - - def maskgit_create_random_mask(self, codes): - """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" - B, C, T = codes.shape - rand_values = torch.rand(B, T, device=codes.device) - frac_masked = cosine_schedule(rand_values) - n_masked = torch.ceil(frac_masked * C).long() - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) - mask = torch.gather(mask, 1, random_permutations) - return mask - - def maskgit_apply_random_mask(self, codes): - """Randomly replaces some codes with MASK_TOKEN following the cosine schedule.""" - mask = self.maskgit_create_random_mask(codes) - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """Predicts the logits for all codebooks using the local transformer. 
- - Used in both autoregressive (AR) and MaskGit (MG) modes during - training and validation (not inference/sampling). - - The sequence layout is slightly different between AR and MG modes, as shown below - (using an 8-codebook setup as an example):: - - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - Args: - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: if False, target for index 0 is codebook 0 (AR); - if True, target for index 1 is codebook 0 (MaskGit). 
- """ - C = self.num_audio_codebooks - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) - local_transformer_input = [dec_out_all] - audio_codes_target = self.pad_audio_codes(audio_codes_target).long() - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(C): - codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] - codes = codes.reshape(-1) - codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) - codebook_embedding = self.audio_in_projection(codebook_embedding) - local_transformer_input.append(codebook_embedding) - - local_transformer_input = torch.stack(local_transformer_input, dim=1) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - if not targets_offset_by_one: - local_transformer_output = local_transformer_output[:, :-1, :] - else: - local_transformer_output = local_transformer_output[:, 1:, :] - - local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) - - all_code_logits = [] - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(audio_codes_target.size(1)): - codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( - local_transformer_output[:, codebook_num + fs_index * C, :] - ) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 - ) - - return all_code_logits - - def local_transformer_sample_autoregressive( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = 
{}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_kv_cache: bool = True, - forbid_audio_eos: bool = False, - sanitize_logits: bool = False, - ) -> torch.Tensor: - """Sample audio codes autoregressively across codebooks using the local transformer. - - Uses multinomial sampling with temperature, top-k, and - classifier-free guidance (CFG). - - Args: - dec_output: Decoder output tensor (B, E). - temperature: Sampling temperature. When <= 0, uses argmax. - topk: Number of top-probability tokens to consider. - unfinished_items: Batch indices that have not completed generation (EOS forbidden). - finished_items: Batch indices that are completed (EOS forced). - use_cfg: Whether to use classifier-free guidance (doubled batch). - cfg_scale: Scale factor for CFG. - use_kv_cache: Whether to use key-value caching in the local transformer. - forbid_audio_eos: Whether to globally forbid audio EOS. - sanitize_logits: Whether to clamp/clean logits before sampling. - - Returns: - Sampled audio codes (B, num_codebooks, frame_stacking_factor). 
- """ - self.local_transformer.reset_cache(use_cache=use_kv_cache) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - - lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :]) - codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj) - - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - if sanitize_logits: - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos - ).squeeze(1) - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) - else: - codebook_probs = 
torch.softmax(codebook_logits_rescored / temperature, dim=-1) - codebook_preds = torch.multinomial(codebook_probs, 1) - - if use_cfg: - codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) - next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) - - all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) - all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - - def local_transformer_sample_maskgit( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - n_steps: int = 3, - noise_scale: float = 0.0, - fixed_schedule: Optional[List[int]] = None, - dynamic_cfg_scale: bool = False, - sampling_type: Optional[str] = None, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """Sample audio codes using MaskGit-like iterative prediction with the local transformer. - - If frame-stacking is enabled, the codes for all frames in the stack - are sampled, treated as one long sequence. - - Args: - dec_output: Decoder output tensor (B, E). - temperature: Sampling temperature. - topk: Number of top-probability tokens to consider. - unfinished_items: Batch indices that have not completed generation. - finished_items: Batch indices that are completed. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: Scale factor for CFG. 
- n_steps: Number of iterative refinement steps. - noise_scale: Scale factor for noise added to confidence scores. - fixed_schedule: Fixed schedule for number of tokens to unmask per step. - dynamic_cfg_scale: Whether to dynamically adjust CFG scale. - sampling_type: Sampling strategy (``"default"``, ``"causal"``, - ``"purity_causal"``, ``"purity_default"``). - forbid_audio_eos: Whether to globally forbid audio EOS. - - Returns: - Sampled audio codes (B, num_codebooks, frame_stacking_factor). - """ - device = dec_output.device - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) - local_transformer_input_init = self.local_transformer_in_projection(dec_output) - codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor - B = dec_output.size(0) - - min_confidence = 0 - max_confidence = 5 - confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) - codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) - sampled_codes = codes.clone() - if fixed_schedule is not None: - n_steps = len(fixed_schedule) - for step in range(n_steps): - progress = step / n_steps - frac_masked = cosine_schedule(torch.tensor(progress)) - if sampling_type == "causal" or sampling_type == "purity_causal": - frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) - if fixed_schedule is None: - n_masked = torch.ceil(codebook_seq_len * frac_masked).long() - else: - n_masked = codebook_seq_len - fixed_schedule[step] - n_unmasked = codebook_seq_len - n_masked - - if sampling_type == "causal" or sampling_type == "purity_causal": - n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1 - - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - if use_cfg: - actual_batch_size = topk_indices.size(0) // 2 - assert ( - topk_indices[actual_batch_size:] == 
topk_indices[:actual_batch_size] - ).all(), "Topk indices are not the same for conditional and unconditional codes" - - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - local_transformer_input = local_transformer_input_init - for codebook_num in range(codebook_seq_len): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) - - _mask = torch.ones(B, codebook_seq_len + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - - logits = [] - for codebook_num in range(codebook_seq_len): - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) - - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - if not dynamic_cfg_scale: - current_cfg_scale = cfg_scale - else: - progress = step / (n_steps - 1) - interp = progress - current_cfg_scale = (cfg_scale - 1) * interp + 1.0 - cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - logits = self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) - - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, :, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - logits_topk = torch.topk(logits, topk, dim=-1)[0] - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) - 
logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) - sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) - if use_cfg: - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - if sampling_type != "purity_causal" and sampling_type != "purity_default": - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - else: - confidences = probs.max(dim=2)[0] - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - if noise_scale > 0.0: - noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - confidences += noise - confidences[actual_batch_size:] = confidences[:actual_batch_size] - confidence_eps = 0.1 - assert ( - confidences.max() + confidence_eps < max_confidence - ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), "Codes contain mask tokens after completion of MaskGit sampling" - - codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) - - if use_cfg: - codes = codes[:actual_batch_size] - return codes diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5a117432b986..ccc8f315a3c2 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -33,7 +33,6 @@ from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import 
MagpieTTSLhotseDataset, setup_tokenizers -from nemo.collections.tts.models.base_magpietts import worker_init_fn from nemo.collections.tts.models.easy_magpietts_inference import ( EasyMagpieTTSInferenceModel, InferBatchOutput, @@ -41,7 +40,13 @@ StreamingState, TrainingMode, ) -from nemo.collections.tts.modules.magpietts_modules import LocalTransformerType +from nemo.collections.tts.modules.magpietts_modules import ( + LocalTransformerType, + add_special_tokens, + remove_eos_token, + remove_special_tokens, + worker_init_fn, +) from nemo.collections.tts.parts.utils.helpers import ( compute_utmos_scores_from_filepaths, get_mask_from_lengths, @@ -229,25 +234,34 @@ def log_val_audio_example( wandb_audio_log = {} pred_audio_codes = self.logits_to_audio_codes(logits, audio_codes_lens_target) - pred_audio_codes, _ = self.remove_eos_token( + pred_audio_codes, _ = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) - target_audio_codes, _ = self.remove_eos_token( + pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( + pred_audio_codes, pred_audio_codes_lens, + ) + target_audio_codes, _ = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) + target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode(target_audio_codes, audio_codes_lens_target - 1) + target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( + target_audio_codes, target_audio_codes_lens, + ) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: # > 3 ensures, it is a valid context audio tensor (and not dummy tensor 
used in text context) - context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + context_audio_codes, context_audio_codes_lens = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio, context_audio_lens, _ = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode(context_audio_codes, context_audio_codes_lens) + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( + context_audio_codes, context_audio_codes_lens, + ) for logger in self.loggers: is_wandb = isinstance(logger, WandbLogger) @@ -545,7 +559,7 @@ def prepare_audio_channel_embeddings( ).long() # Add BOS and EOS tokens - audio_codes, audio_codes_lens = self.add_special_tokens( + audio_codes, audio_codes_lens = add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, bos_id=self.audio_bos_id, @@ -859,7 +873,7 @@ def process_batch( local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( @@ -918,7 +932,9 @@ def training_step(self, batch, batch_idx): else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if 'audio_codes' in batch: audio_codes = batch['audio_codes'] @@ -926,7 +942,7 @@ def training_step(self, batch, batch_idx): else: audio = batch['audio'] audio_lens = 
batch['audio_lens'] - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes(audio, audio_lens) batch_output = self.process_batch( text=batch['text'], @@ -1013,7 +1029,9 @@ def validation_step(self, batch, batch_idx): else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if 'audio_codes' in batch: audio_codes = batch['audio_codes'] @@ -1021,7 +1039,7 @@ def validation_step(self, batch, batch_idx): else: audio = batch['audio'] audio_lens = batch['audio_lens'] - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes(audio, audio_lens) batch_output = self.process_batch( text=batch['text'], @@ -1095,12 +1113,15 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths = [] context_audio_paths = [] - context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self.remove_special_tokens( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self._prepare_codes_for_decode( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + ) + context_audio_cleaned, context_audio_lens_cleaned, _ = self._codec_helper.codes_to_audio( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned, ) for idx in range(infer_output.predicted_audio.size(0)): diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py 
b/nemo/collections/tts/models/easy_magpietts_inference.py index 59db7decda0e..555c30308e39 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -27,15 +27,19 @@ from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter from nemo.collections.tts.modules.magpietts_modules import ( CharAwareSubwordEncoder, + CodecHelper, + LocalTransformerHelper, LocalTransformerType, SpecialAudioToken, + add_special_tokens, + remove_special_tokens, ) from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -207,7 +211,7 @@ def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': return cls(**filtered_data) -class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): +class EasyMagpieTTSInferenceModel(ModelPT): """ Inference-only base class for EasyMagpieTTS decoder-only model. 
@@ -350,6 +354,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model = codec_model self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + self._codec_helper = CodecHelper(self._codec_model, self._codec_converter) # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) @@ -484,33 +489,84 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - def _get_state_dict_keys_to_exclude(self): + self._lt_helper = LocalTransformerHelper( + local_transformer=self.local_transformer, + audio_embeddings=self.audio_embeddings, + audio_in_projection=self.audio_in_projection, + local_transformer_in_projection=self.local_transformer_in_projection, + local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, + local_transformer_out_projections=self.local_transformer_out_projections, + num_audio_codebooks=self.num_audio_codebooks, + frame_stacking_factor=self.frame_stacking_factor, + audio_eos_id=self.audio_eos_id, + mask_token_id=self.mask_token_id, + codebook_size=self.codebook_size, + ) + + def _get_state_dict_keys_to_exclude(self) -> List[str]: return [ '_codec_model', ] + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + if not strict: + super().load_state_dict(state_dict, strict=False) + modules_to_skip = 
self._get_state_dict_keys_to_exclude() + for name, child in self.named_children(): + if name in modules_to_skip: + continue + if any(param.numel() > 0 for param in child.parameters()): + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." + if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def setup_training_data(self, train_data_config=None): pass def setup_validation_data(self, val_data_config=None): pass - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - self._codec_model.eval() + def _prepare_codes_for_decode(self, codes, codes_len, min_len=4): + """Unstack frame-stacked codes and pad short sequences before decoding.""" if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - if codes_len.min() < 4: - codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) - codes_len 
= torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:, :, : codes_len.max()] - - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - return audio, audio_len, codes + if min_len > 0 and codes_len.min() < min_len: + codes = torch.nn.functional.pad(input=codes, pad=(0, min_len - codes_len.min()), value=0) + codes_len = torch.where(codes_len < min_len, torch.ones_like(codes_len) * min_len, codes_len) + codes = codes[:, :, : codes_len.max()] + return codes, codes_len def embed_audio_tokens(self, audio_tokens): # audio_tokens: (B, C, T') @@ -750,14 +806,16 @@ def prepare_context_tensors( if context_audio_codes is None: if context_audio is None: raise ValueError("Either context_audio_codes or context_audio must be provided") - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens ).long() - context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + context_audio_codes, context_audio_codes_lens = add_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, bos_id=self.context_audio_bos_id, @@ -921,7 +979,7 @@ def _sample_audio_codes( """ if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=last_hidden[:, -1, :], temperature=temperature, topk=topk, @@ -1663,7 +1721,10 @@ def streaming_finalize( # No need to remove EOS - end_indices already point to the frame before EOS # Decode to audio (codes are already unstacked: B, C, T) - audio, audio_len, 
decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) + predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode(predicted_codes, predicted_codes_lens) + audio, audio_len, decoded_codes = self._codec_helper.codes_to_audio( + predicted_codes, predicted_codes_lens, + ) return StreamingFinalizeOutput( audio=audio, @@ -1740,7 +1801,9 @@ def infer_batch( else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) # Optional GT phoneme tokens for teacher forcing gt_phoneme_tokens = batch.get('phoneme_tokens') @@ -1761,7 +1824,9 @@ def infer_batch( elif 'audio' in batch: gt_audio = batch['audio'] gt_audio_lens = batch['audio_lens'] - gt_audio_codes, gt_audio_codes_lens = self.audio_to_codes(gt_audio, gt_audio_lens) + gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes( + gt_audio, gt_audio_lens + ) else: raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") @@ -1771,7 +1836,7 @@ def infer_batch( audio_tokens=gt_audio_codes, audio_lens=gt_audio_codes_lens ).long() - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + gt_audio_codes_processed, gt_audio_codes_lens_processed = add_special_tokens( codes=gt_audio_codes, codes_len=gt_audio_codes_lens, bos_id=self.audio_bos_id, @@ -1977,7 +2042,9 @@ def do_tts( context_audio = context_audio.to(device) context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) else: 
context_audio_codes = torch.zeros( 1, diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 020e7af77aa5..600ddda579bd 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -437,7 +437,10 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: context_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_codes, audio_lens=context_lens ).long() - context_audio, context_audio_lens, _ = self.codes_to_audio(context_codes, context_lens) + context_codes, context_lens = self._prepare_codes_for_decode(context_codes, context_lens) + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( + context_codes, context_lens, + ) return self._save_waveforms_to_paths( waveforms=context_audio, waveform_lens=context_audio_lens, @@ -462,7 +465,7 @@ def _run_easy_process_batch( context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] else: - context_audio_codes, context_audio_codes_lens = model.audio_to_codes( + context_audio_codes, context_audio_codes_lens = model._codec_helper.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index 28af39542f21..f710bb853986 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -38,15 +38,26 @@ from nemo.collections.tts.losses.aligner_loss import ForwardSumLoss from nemo.collections.tts.losses.moe_loss import MoEAuxiliaryLoss, compute_expert_usage from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel, worker_init_fn from nemo.collections.tts.modules import transformer_2501 from 
nemo.collections.tts.modules.aligner import AlignmentEncoder from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter from nemo.collections.tts.modules.magpietts_modules import ( CharAwareSubwordEncoder, + CodecHelper, EOSDetectionMethod, + LocalTransformerHelper, LocalTransformerType, SpecialAudioToken, + add_eos_token, + add_special_tokens, + clear_forbidden_logits, + pad_audio_codes, + remove_bos_token, + remove_embedded_bos_token, + remove_embedded_eos_token, + remove_eos_token, + remove_special_tokens, + worker_init_fn, ) from nemo.collections.tts.parts.utils.helpers import ( binarize_attention_parallel, @@ -59,6 +70,7 @@ get_tokenizer_for_language, stack_tensors, ) +from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -299,7 +311,7 @@ def from_dict(cls, data: dict) -> 'ModelInferenceParameters': return cls(**filtered_data) -class MagpieTTSModel(BaseMagpieTTSModel): +class MagpieTTSModel(ModelPT): """ Magpie-TTS Model Base Class used for training a TTS model that can generate audio codes from transcript and a context audio/text @@ -470,13 +482,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model = codec_model self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + self._codec_helper = CodecHelper(self._codec_model, self._codec_converter) audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) - # Identity projections required by BaseMagpieTTSModel local transformer methods. + # Identity projections required by LocalTransformerHelper methods. # MagpieTTSModel embeds directly in embedding_dim, so no projection is needed. 
self.audio_in_projection = nn.Identity() self.local_transformer_audio_out_projection = nn.Identity() @@ -537,6 +550,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + self._lt_helper = LocalTransformerHelper( + local_transformer=self.local_transformer, + audio_embeddings=self.audio_embeddings, + audio_in_projection=self.audio_in_projection, + local_transformer_in_projection=self.local_transformer_in_projection, + local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, + local_transformer_out_projections=self.local_transformer_out_projections, + num_audio_codebooks=self.num_audio_codebooks, + frame_stacking_factor=self.frame_stacking_factor, + audio_eos_id=self.audio_eos_id, + mask_token_id=self.mask_token_id, + codebook_size=self.codebook_size, + ) + if cfg.get('use_alignment_encoder', False): self.alignment_encoder = AlignmentEncoder( n_mel_channels=cfg.embedding_dim, @@ -751,6 +778,35 @@ def _get_state_dict_keys_to_exclude(self): keys.append('context_encoder') return keys + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in 
excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def check_frame_stacking_config_validity(self): """ Check if the configuration is compatible with frame stacking. @@ -997,7 +1053,7 @@ def load_state_dict(self, state_dict, strict=True): def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): B, C, T = audio_tokens.shape - audio_tokens = self.pad_audio_codes(audio_tokens).long() + audio_tokens = pad_audio_codes(audio_tokens, self.frame_stacking_factor).long() audio_embedding = None for i in range(self.frame_stacking_factor): for c in range(C): @@ -1045,7 +1101,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N # repeat loss mask for each codebook to simplify code below loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None - audio_codes = self.pad_audio_codes(audio_codes).long() + audio_codes = pad_audio_codes(audio_codes, self.frame_stacking_factor).long() for fs_index in range(frame_stacking_factor): for codebook in range(audio_codes.size(1)): si = (codebook + self.num_audio_codebooks * fs_index) * self.num_all_tokens_per_codebook @@ -1210,8 +1266,8 @@ def sample_codes_from_logits( codebook_logits[item_idx, self.audio_eos_id] = 0.0 # Disallow generation of special tokens - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos + codebook_logits = clear_forbidden_logits( + codebook_logits.unsqueeze(1), self.codebook_size, forbid_audio_eos=forbid_audio_eos ).squeeze(1) codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) @@ -1301,25 +1357,29 @@ def _prepare_audio_examples( with torch.no_grad(): # Decode predictions: convert logits to codes, remove EOS token, then decode to audio pred_audio_codes = 
self.logits_to_audio_codes(logits, audio_codes_lens) - pred_audio_codes, pred_audio_codes_lens = self.remove_eos_token( + pred_audio_codes, pred_audio_codes_lens = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens ) - pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, pred_audio_codes_lens) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( + pred_audio_codes, pred_audio_codes_lens + ) # Decode targets: remove EOS token, then decode to audio - target_audio_codes, target_audio_codes_lens = self.remove_eos_token( + target_audio_codes, target_audio_codes_lens = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens ) - target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, target_audio_codes_lens) + target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( + target_audio_codes, target_audio_codes_lens + ) # Decode context audio if available (shape check ensures it's not a dummy tensor used in text context) # This does not handle the case in which a batch has a mixture of text and audio context examples context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: - context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + context_audio_codes, context_audio_codes_lens = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens ) - context_audio, context_audio_lens, _ = self.codes_to_audio( + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( context_audio_codes, context_audio_codes_lens ) @@ -1539,14 +1599,15 @@ def _get_context_audio_codes(self, batch: Dict[str, torch.Tensor]) -> Tuple[torc codes = batch['context_audio_codes'] lens = batch['context_audio_codes_lens'] else: - codes, lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], batch.get('context_sample_rate') + codes, lens = 
self._codec_helper.audio_to_codes( + batch['context_audio'], batch['context_audio_lens'], + sample_rate=batch.get('context_sample_rate'), ) if self._codec_converter is not None: codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=lens) - codes, lens = self.add_special_tokens( + codes, lens = add_special_tokens( codes=codes, codes_len=lens, bos_id=self.context_audio_bos_id, @@ -1950,8 +2011,9 @@ def process_batch(self, batch): disable_alignment_loss = False if 'audio_codes' not in batch: - audio_codes, audio_codes_lens = self.audio_to_codes( - batch['audio'], batch['audio_lens'], batch.get('sample_rate') + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes( + batch['audio'], batch['audio_lens'], + sample_rate=batch.get('sample_rate'), ) else: audio_codes = batch['audio_codes'] @@ -1962,7 +2024,7 @@ def process_batch(self, batch): audio_tokens=audio_codes, audio_lens=audio_codes_lens ) - audio_codes, audio_codes_lens = self.add_special_tokens( + audio_codes, audio_codes_lens = add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, bos_id=self.audio_bos_id, @@ -1976,7 +2038,7 @@ def process_batch(self, batch): # Note: if a tensor lacks the `_unstacked` suffix, it can be assumed to be in the frame-stacked domain # Remove EOS token for decoder inputs - audio_codes_embedded_input, audio_codes_lens_input = self.remove_embedded_eos_token( + audio_codes_embedded_input, audio_codes_lens_input = remove_embedded_eos_token( embedded=audio_codes_embedded_all, embedded_len=audio_codes_lens_all ) use_cfg = self.training and (self.cfg_unconditional_prob > 0.0) and (context_tensors.cond is not None) @@ -2009,7 +2071,7 @@ def process_batch(self, batch): random_embedded, random_embedded_lens = self.embed_audio_tokens( audio_tokens=random_audio_tokens, audio_tokens_lens=audio_codes_lens ) # (B T E) - random_embedded, random_embedded_lens = self.remove_embedded_eos_token( + random_embedded, random_embedded_lens = 
remove_embedded_eos_token( embedded=random_embedded, embedded_len=random_embedded_lens ) dec_dropout_mask = ( @@ -2028,7 +2090,7 @@ def process_batch(self, batch): audio_codes_mask = torch.cat([additional_decoder_mask, audio_codes_mask], dim=1) # Remove BOS token for aligner targets - audio_codes_embedded_target, audio_codes_lens_target = self.remove_embedded_bos_token( + audio_codes_embedded_target, audio_codes_lens_target = remove_embedded_bos_token( embedded=audio_codes_embedded_all, embedded_len=audio_codes_lens_all ) aligner_encoder_loss = None @@ -2083,7 +2145,7 @@ def process_batch(self, batch): logits = logits[:, dec_context_size:, :] # Remove the context audio embeddings from the logits # Remove BOS tokens from decoder targets - audio_codes_target_unstacked, audio_codes_lens_target_unstacked = self.remove_bos_token( + audio_codes_target_unstacked, audio_codes_lens_target_unstacked = remove_bos_token( codes=audio_codes, codes_len=audio_codes_lens, num_tokens=self.frame_stacking_factor ) # Codebook loss (parallel) @@ -2116,10 +2178,10 @@ def process_batch(self, batch): if self.local_transformer_type == LocalTransformerType.MASKGIT: # Maskgit # randomly replace some positions with MASK_TOKEN - audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target_unstacked) + audio_codes_masked, mask_tokens_mask = self._lt_helper.apply_random_mask(audio_codes_target_unstacked) # TODO @rfejgin: the very last position might be padding but the local transformer might look at it as part of # of a pair where the first position is valid. Is this an issue? 
- local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( dec_out[:, dec_context_size:, :], audio_codes_masked, targets_offset_by_one=True ) local_transformer_loss, _ = self.compute_loss( @@ -2132,7 +2194,7 @@ def process_batch(self, batch): else: # Autoregressive assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( dec_out[:, dec_context_size:, :], audio_codes_target_unstacked, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( @@ -2903,7 +2965,7 @@ def infer_batch( if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -2915,7 +2977,7 @@ def infer_batch( forbid_audio_eos=forbid_audio_eos, ) elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( + audio_codes_next = self._lt_helper.sample_maskgit( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -2982,7 +3044,7 @@ def infer_batch( predicted_codes_lens = torch.tensor(predicted_lens, device=text.device).long() predicted_codes = predicted_codes[:, :, : predicted_codes_lens.max()] - predicted_audio, predicted_audio_lens, predicted_codes = self.codes_to_audio( + predicted_audio, predicted_audio_lens, predicted_codes = self._codec_helper.codes_to_audio( predicted_codes, predicted_codes_lens ) end_time = time.time() @@ -3682,7 +3744,9 @@ def do_tts( if len(all_codes) > 0: 
concatenated_codes = torch.cat(all_codes, dim=1).unsqueeze(0) codes_lens = torch.tensor([concatenated_codes.shape[2]], device=self.device, dtype=torch.long) - predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(concatenated_codes, codes_lens) + predicted_audio, predicted_audio_lens, _ = self._codec_helper.codes_to_audio( + concatenated_codes, codes_lens + ) return predicted_audio, predicted_audio_lens else: return torch.zeros(1, 0, device=self.device), torch.zeros(1, device=self.device, dtype=torch.long) @@ -4489,7 +4553,7 @@ def generate_speech( if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -4501,7 +4565,7 @@ def generate_speech( forbid_audio_eos=forbid_audio_eos, ) elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( + audio_codes_next = self._lt_helper.sample_maskgit( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, diff --git a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index d754f5718130..ce36df483ede 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -51,6 +51,7 @@ PYNINI_AVAILABLE = False from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.modules.magpietts_modules import add_eos_token class MagpieTTSModelOfflinePODataGen(MagpieTTSModel): @@ -904,7 +905,7 @@ def process_batch_online_po(self, batch, n_generations_per_item, mode='train'): with 
torch.no_grad(): reference_model_output = self._reference_model.process_batch(batch_repeated) - codebook_targets, _ = self.add_eos_token( + codebook_targets, _ = add_eos_token( codes=predicted_codes, codes_len=predicted_codes_lens, eos_id=self.audio_eos_id ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index e936f81439be..2cebff638977 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -307,7 +307,7 @@ class MagpieInferenceRunner(BaseInferenceRunner): """Runner for encoder-decoder MagpieTTSModel. Uses ChunkedTTSInferenceDataset and model.generate_speech() per chunk, - then model.codes_to_audio() to produce waveforms. + then codes_to_audio() to produce waveforms. """ def __init__(self, model, config: MagpieInferenceConfig): @@ -472,8 +472,8 @@ def _run_unified_inference( predicted_codes = stack_tensors(predicted_codes_list, max_lens=[max_code_len]).cuda() predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') - predicted_audio, predicted_audio_lens, _ = self.model.codes_to_audio( - predicted_codes, predicted_codes_lens_tensor + predicted_audio, predicted_audio_lens, _ = self.model._codec_helper.codes_to_audio( + predicted_codes, predicted_codes_lens_tensor, ) total_audio_samples = sum(predicted_audio_lens.cpu().tolist()) diff --git a/nemo/collections/tts/modules/magpietts_modules.py b/nemo/collections/tts/modules/magpietts_modules.py index 8569b691242f..3f11e8231488 100644 --- a/nemo/collections/tts/modules/magpietts_modules.py +++ b/nemo/collections/tts/modules/magpietts_modules.py @@ -15,13 +15,18 @@ from __future__ import annotations from enum import Enum +from typing import Dict, List, Optional +import numpy as np import torch +from hydra.utils import instantiate from torch import Tensor +from torch.utils.data import get_worker_info 
# ---------------------------------------------------------------------------
# Audio code utility functions
# ---------------------------------------------------------------------------
# NOTE: this section uses the module-level imports ``np``, ``torch``,
# ``logging``, ``instantiate`` (hydra), ``get_worker_info``,
# ``get_mask_from_lengths``, and the module-local ``cosine_schedule`` and
# ``SpecialAudioToken`` defined earlier in this file.


def worker_init_fn(worker_id):
    """Per-worker init for DataLoader workers.

    Sets up tokenizers for the dataset (text and optionally phoneme)
    when using multiprocessing.
    """
    # Imported lazily to avoid a circular import at module load time.
    from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers

    logging.info(f"Worker {worker_id} initializing...")
    worker_info = get_worker_info()
    dataset = worker_info.dataset
    tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type)
    dataset.text_tokenizer = tokenizer
    if hasattr(dataset, 'phoneme_tokenizer_config'):
        dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config)


def add_eos_token(codes, codes_len, eos_id, num_eos_tokens=1):
    """Appends EOS tokens at the end of each sequence in the batch.

    Args:
        codes: (B, C, T')
        codes_len: (B,)
        eos_id: Token id to use as EOS.
        num_eos_tokens: Number of EOS tokens to append.

    Returns:
        Tuple of (codes with EOS appended, updated lengths).
    """
    codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0)
    codes_len = codes_len + num_eos_tokens
    for idx in range(codes.size(0)):
        # Fix: fill ALL appended positions with EOS, not just the last one
        # (previously positions len-num_eos_tokens .. len-2 were left as 0
        # when num_eos_tokens > 1).
        codes[idx, :, codes_len[idx] - num_eos_tokens : codes_len[idx]] = eos_id
    return codes, codes_len


def add_special_tokens(codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1):
    """Prepends BOS and appends EOS tokens to each sequence.

    Args:
        codes: (B, C, T')
        codes_len: (B,)
        bos_id: Token id to use as BOS.
        eos_id: Token id to use as EOS.
        num_bos_tokens: Number of BOS tokens to prepend.
        num_eos_tokens: Number of EOS tokens to append.

    Returns:
        Tuple of (codes with BOS/EOS, updated lengths).
    """
    codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id)
    codes_len = codes_len + num_bos_tokens
    codes, codes_len = add_eos_token(codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens)
    return codes, codes_len


def remove_bos_token(codes, codes_len, num_tokens=1):
    """Drops the first ``num_tokens`` (BOS) frames from codes (B, C, T')."""
    codes = codes[:, :, num_tokens:]
    codes_len = codes_len - num_tokens
    return codes, codes_len


def remove_embedded_bos_token(embedded, embedded_len):
    """Drops the first (BOS) timestep from embedded sequences (B, T', D)."""
    embedded = embedded[:, 1:, :]
    embedded_len = embedded_len - 1
    return embedded, embedded_len


def remove_eos_token(codes, codes_len):
    """Drops the last (EOS) frame from codes (B, C, T') and re-masks padding."""
    codes_len = codes_len - 1
    codes = codes[:, :, :-1]
    mask = get_mask_from_lengths(lengths=codes_len)
    codes = codes * mask.unsqueeze(1)
    return codes, codes_len


def remove_embedded_eos_token(embedded, embedded_len):
    """Remove the last token from embedded sequences.

    Args:
        embedded: (B, T', D)
        embedded_len: (B,)
    """
    embedded_len = embedded_len - 1
    embedded = embedded[:, :-1, :]
    mask = get_mask_from_lengths(lengths=embedded_len)
    embedded = embedded * mask.unsqueeze(2)
    return embedded, embedded_len


def remove_special_tokens(codes, codes_len, num_bos_tokens=1):
    """Drops BOS then EOS tokens from codes (B, C, T')."""
    codes, codes_len = remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens)
    codes, codes_len = remove_eos_token(codes=codes, codes_len=codes_len)
    return codes, codes_len


def pad_audio_codes(audio_codes: torch.Tensor, frame_stacking_factor: int) -> torch.Tensor:
    """Pads the time dimension of audio codes to a multiple of *frame_stacking_factor*.

    Args:
        audio_codes: (B, C, T)
        frame_stacking_factor: Factor to pad to.

    Returns:
        (B, C, T_padded)
    """
    T = audio_codes.size(2)
    T_padded = int(np.ceil(T / frame_stacking_factor) * frame_stacking_factor)
    num_pad = T_padded - T
    audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad))
    return audio_codes


def clear_forbidden_logits(logits: torch.Tensor, codebook_size: int, forbid_audio_eos: bool = False) -> torch.Tensor:
    """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled.

    Specifically, we forbid sampling of all special tokens except AUDIO_EOS
    which is allowed by default.

    Args:
        logits: (B, C, num_audio_tokens_per_codebook) or compatible shape.
        codebook_size: Base codebook size (excluding special tokens).
        forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled.
    """
    logits[
        :,
        :,
        SpecialAudioToken.get_forbidden_tokens(codebook_size, forbid_audio_eos=forbid_audio_eos),
    ] = float('-inf')
    return logits


class CodecHelper:
    """Thin wrapper around a codec model and optional token converter.

    Instantiate once per model and use ``audio_to_codes`` / ``codes_to_audio``
    without having to pass the codec objects every time.
    """

    def __init__(self, codec_model, codec_converter=None):
        self.codec_model = codec_model
        self.codec_converter = codec_converter

    def audio_to_codes(self, audio, audio_len, sample_rate=None):
        """Encode audio waveforms into codec codes.

        Returns:
            Tuple of (codes, codes_len).
        """
        self.codec_model.eval()
        # Codec runs in fp32 regardless of the surrounding autocast context.
        with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32):
            codes, codes_len = self.codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate)
            return codes, codes_len

    def codes_to_audio(self, codes, codes_len):
        """Decode codec codes back into audio waveforms.

        ``codes`` must already be unstacked to the shape the codec expects.

        Returns:
            Tuple of (audio, audio_len, possibly-converted codes).
        """
        self.codec_model.eval()
        with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32):
            if self.codec_converter is not None:
                codes = self.codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len)
            audio, audio_len = self.codec_model.decode(tokens=codes, tokens_len=codes_len)
            return audio, audio_len, codes


# ---------------------------------------------------------------------------
# LocalTransformerHelper
# ---------------------------------------------------------------------------


class LocalTransformerHelper:
    """Orchestrates local-transformer forward passes and sampling.

    This is a plain Python class (not ``nn.Module``) that holds *references*
    to nn.Module sub-modules owned by the parent model. Keeping it non-Module
    preserves checkpoint key compatibility.

    Args:
        local_transformer: The local transformer module.
        audio_embeddings: List/ModuleList of per-codebook embedding layers.
        audio_in_projection: Linear projection applied after per-codebook embedding.
        local_transformer_in_projection: Projection into the local transformer input space.
        local_transformer_audio_out_projection: Projection applied to local transformer output
            before the per-codebook output heads.
        local_transformer_out_projections: List/ModuleList of per-codebook output heads.
        num_audio_codebooks: Number of audio codebooks (C).
        frame_stacking_factor: Frame stacking factor (S).
        audio_eos_id: Token id for audio EOS.
        mask_token_id: Token id used for MaskGit masking.
        codebook_size: Base codebook size (excluding special tokens).
    """

    def __init__(
        self,
        local_transformer,
        audio_embeddings,
        audio_in_projection,
        local_transformer_in_projection,
        local_transformer_audio_out_projection,
        local_transformer_out_projections,
        num_audio_codebooks: int,
        frame_stacking_factor: int,
        audio_eos_id: int,
        mask_token_id: int,
        codebook_size: int,
    ):
        self.local_transformer = local_transformer
        self.audio_embeddings = audio_embeddings
        self.audio_in_projection = audio_in_projection
        self.local_transformer_in_projection = local_transformer_in_projection
        self.local_transformer_audio_out_projection = local_transformer_audio_out_projection
        self.local_transformer_out_projections = local_transformer_out_projections
        self.num_audio_codebooks = num_audio_codebooks
        self.frame_stacking_factor = frame_stacking_factor
        self.audio_eos_id = audio_eos_id
        self.mask_token_id = mask_token_id
        self.codebook_size = codebook_size

    def create_random_mask(self, codes):
        """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.

        Per (batch, time) position a masking fraction is drawn from the cosine
        schedule and a random subset of codebooks of that size is masked.
        """
        B, C, T = codes.shape
        rand_values = torch.rand(B, T, device=codes.device)
        frac_masked = cosine_schedule(rand_values)
        n_masked = torch.ceil(frac_masked * C).long()
        # Random permutation of codebook indices so the masked subset is uniform.
        random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1)
        mask_indices = torch.arange(C, device=codes.device).view(1, C, 1)
        mask = mask_indices < n_masked.view(B, 1, T)
        mask = torch.gather(mask, 1, random_permutations)
        return mask

    def apply_random_mask(self, codes):
        """Randomly replaces some codes with MASK_TOKEN following the cosine schedule."""
        mask = self.create_random_mask(codes)
        codes_with_mask = torch.where(mask, self.mask_token_id, codes)
        return codes_with_mask, mask

    def compute_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False):
        """Predicts the logits for all codebooks using the local transformer.

        Used in both autoregressive (AR) and MaskGit (MG) modes during
        training and validation (not inference/sampling).

        The sequence layout is slightly different between AR and MG modes, as shown below
        (using an 8-codebook setup as an example)::

            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | AR target  | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       | none    |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | MG target  | none    | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | Input      | Magpie  | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       |
            |            | Latent  | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | Seq. Index | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       | 8       |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+

        Args:
            dec_out: (B, T', E)
            audio_codes_target: (B, C, T')
            targets_offset_by_one: if False, target for index 0 is codebook 0 (AR);
                if True, target for index 1 is codebook 0 (MaskGit).

        Returns:
            Logits of shape (B, T'_stacked, C * S * vocab).
        """
        C = self.num_audio_codebooks
        dec_out_all = dec_out.reshape(-1, dec_out.size(-1))  # (B*T', E)
        local_transformer_input = [dec_out_all]
        audio_codes_target = pad_audio_codes(audio_codes_target, self.frame_stacking_factor).long()
        for fs_index in range(self.frame_stacking_factor):
            for codebook_num in range(C):
                codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor]
                codes = codes.reshape(-1)
                codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes)
                codebook_embedding = self.audio_in_projection(codebook_embedding)
                local_transformer_input.append(codebook_embedding)

        local_transformer_input = torch.stack(local_transformer_input, dim=1)
        local_transformer_input = self.local_transformer_in_projection(local_transformer_input)
        _mask = torch.ones(
            local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device
        )
        local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']
        if not targets_offset_by_one:
            # AR layout: position i predicts codebook i, so drop the last output.
            local_transformer_output = local_transformer_output[:, :-1, :]
        else:
            # MG layout: position i+1 predicts codebook i, so drop the first output.
            local_transformer_output = local_transformer_output[:, 1:, :]

        local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output)

        all_code_logits = []
        for fs_index in range(self.frame_stacking_factor):
            for codebook_num in range(audio_codes_target.size(1)):
                codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C](
                    local_transformer_output[:, codebook_num + fs_index * C, :]
                )
                all_code_logits.append(codebook_logits)
        all_code_logits = torch.cat(all_code_logits, dim=1)

        all_code_logits = all_code_logits.view(
            audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1
        )

        return all_code_logits

    def sample_autoregressive(
        self,
        dec_output: torch.Tensor,
        temperature: float = 0.7,
        topk: int = 80,
        unfinished_items: Optional[Dict[int, bool]] = None,
        finished_items: Optional[Dict[int, bool]] = None,
        use_cfg: bool = False,
        cfg_scale: float = 1.0,
        use_kv_cache: bool = True,
        forbid_audio_eos: bool = False,
        sanitize_logits: bool = False,
    ) -> torch.Tensor:
        """Sample audio codes autoregressively across codebooks using the local transformer.

        Args:
            dec_output: Decoder output tensor (B, E).
            temperature: Sampling temperature. When <= 0, uses argmax.
            topk: Number of top-probability tokens to consider.
            unfinished_items: Batch indices that have not completed generation (EOS forbidden).
            finished_items: Batch indices that are completed (EOS forced).
            use_cfg: Whether to use classifier-free guidance (doubled batch).
            cfg_scale: Scale factor for CFG.
            use_kv_cache: Whether to use key-value caching in the local transformer.
            forbid_audio_eos: Whether to globally forbid audio EOS.
            sanitize_logits: Whether to clamp/clean logits before sampling.

        Returns:
            Sampled audio codes (B, num_codebooks, frame_stacking_factor).
        """
        # Fix: do not use mutable default arguments ({}); normalize None here.
        unfinished_items = {} if unfinished_items is None else unfinished_items
        finished_items = {} if finished_items is None else finished_items

        self.local_transformer.reset_cache(use_cache=use_kv_cache)
        dec_output = dec_output.unsqueeze(1)  # (B, 1, E)
        local_transformer_input = self.local_transformer_in_projection(dec_output)
        all_preds = []
        for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor):
            _mask = torch.ones(
                local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device
            )
            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']

            lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :])
            codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj)

            if use_cfg:
                # First half of the batch is conditional, second half unconditional.
                actual_batch_size = codebook_logits.size(0) // 2
                conditional_logits = codebook_logits[:actual_batch_size]
                unconditional_logits = codebook_logits[actual_batch_size:]
                cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits
                codebook_logits[:actual_batch_size] = cfg_logits

            if sanitize_logits:
                codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0)
                codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0)

            for item_idx in unfinished_items:
                codebook_logits[item_idx, self.audio_eos_id] = float('-inf')
            for item_idx in finished_items:
                codebook_logits[item_idx, :] = float('-inf')
                codebook_logits[item_idx, self.audio_eos_id] = 0.0

            codebook_logits = clear_forbidden_logits(
                codebook_logits.unsqueeze(1), self.codebook_size, forbid_audio_eos=forbid_audio_eos
            ).squeeze(1)

            # Top-k filtering: anything below the k-th best logit is removed.
            codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0]
            indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1)
            codebook_logits_rescored = codebook_logits.clone()
            codebook_logits_rescored[indices_to_remove] = float('-inf')

            if temperature <= 0.0:
                codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True)
            else:
                codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1)
                codebook_preds = torch.multinomial(codebook_probs, 1)

            if use_cfg:
                # Keep conditional/unconditional halves in sync.
                codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size]
            all_preds.append(codebook_preds)

            next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1)
            next_local_transformer_input = self.audio_in_projection(next_local_transformer_input)
            next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)
            local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)

        all_preds = torch.cat(all_preds, dim=1)  # (B, num_codebooks * frame_stacking_factor)
        all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1)
        if use_cfg:
            all_preds = all_preds[:actual_batch_size]

        return all_preds

    def sample_maskgit(
        self,
        dec_output: torch.Tensor,
        temperature: float = 0.7,
        topk: int = 80,
        unfinished_items: Optional[Dict[int, bool]] = None,
        finished_items: Optional[Dict[int, bool]] = None,
        use_cfg: bool = False,
        cfg_scale: float = 1.0,
        n_steps: int = 3,
        noise_scale: float = 0.0,
        fixed_schedule: Optional[List[int]] = None,
        dynamic_cfg_scale: bool = False,
        sampling_type: Optional[str] = None,
        forbid_audio_eos: bool = False,
    ) -> torch.Tensor:
        """Sample audio codes using MaskGit-like iterative prediction with the local transformer.

        Args:
            dec_output: Decoder output tensor (B, E).
            temperature: Sampling temperature.
            topk: Number of top-probability tokens to consider.
            unfinished_items: Batch indices that have not completed generation.
            finished_items: Batch indices that are completed.
            use_cfg: Whether to use classifier-free guidance.
            cfg_scale: Scale factor for CFG.
            n_steps: Number of iterative refinement steps.
            noise_scale: Scale factor for noise added to confidence scores.
            fixed_schedule: Fixed schedule for number of tokens to unmask per step.
            dynamic_cfg_scale: Whether to dynamically adjust CFG scale.
            sampling_type: Sampling strategy.
            forbid_audio_eos: Whether to globally forbid audio EOS.

        Returns:
            Sampled audio codes (B, num_codebooks, frame_stacking_factor).
        """
        # Fix: do not use mutable default arguments ({}); normalize None here.
        unfinished_items = {} if unfinished_items is None else unfinished_items
        finished_items = {} if finished_items is None else finished_items

        device = dec_output.device
        self.local_transformer.reset_cache(use_cache=False)
        dec_output = dec_output.unsqueeze(1)
        local_transformer_input_init = self.local_transformer_in_projection(dec_output)
        codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor
        B = dec_output.size(0)

        min_confidence = 0
        max_confidence = 5
        confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device)
        codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long)
        sampled_codes = codes.clone()
        if fixed_schedule is not None:
            n_steps = len(fixed_schedule)
        for step in range(n_steps):
            progress = step / n_steps
            frac_masked = cosine_schedule(torch.tensor(progress))
            if sampling_type == "causal" or sampling_type == "purity_causal":
                frac_masked = torch.ones_like(frac_masked) * (1.0 - progress)
            if fixed_schedule is None:
                n_masked = torch.ceil(codebook_seq_len * frac_masked).long()
            else:
                n_masked = codebook_seq_len - fixed_schedule[step]
            # Fix: coerce to a plain int so torch.topk(k=...) accepts it in all
            # torch versions (n_masked may be a 0-dim tensor here).
            n_unmasked = int(codebook_seq_len - n_masked)

            if sampling_type == "causal" or sampling_type == "purity_causal":
                # Only allow unmasking within the first few stacked frames.
                n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1))
                confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1

            _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1)
            if use_cfg:
                actual_batch_size = topk_indices.size(0) // 2
                assert (
                    topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size]
                ).all(), "Topk indices are not the same for conditional and unconditional codes"

            # Commit the most-confident previously sampled codes; the rest stay MASK.
            unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices)
            codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)

            local_transformer_input = local_transformer_input_init
            for codebook_num in range(codebook_seq_len):
                # NOTE(review): unlike compute_logits/sample_autoregressive, this path
                # does not apply self.audio_in_projection after the embedding —
                # confirm this is intentional (it only works if dims already match).
                next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1)
                next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)
                local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)

            _mask = torch.ones(B, codebook_seq_len + 1, device=device)
            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']

            logits = []
            for codebook_num in range(codebook_seq_len):
                # NOTE(review): self.local_transformer_audio_out_projection is not applied
                # here, unlike in compute_logits — confirm against training code.
                codebook_logits = self.local_transformer_out_projections[codebook_num](
                    local_transformer_output[:, codebook_num + 1, :]
                )
                logits.append(codebook_logits)
            logits = torch.stack(logits, dim=1)  # (B, C*S, V)

            if use_cfg:
                actual_batch_size = logits.size(0) // 2
                conditional_logits = logits[:actual_batch_size]
                unconditional_logits = logits[actual_batch_size:]
                if not dynamic_cfg_scale:
                    current_cfg_scale = cfg_scale
                else:
                    # Linearly ramp the CFG scale from 1.0 up to cfg_scale.
                    progress = step / (n_steps - 1)
                    interp = progress
                    current_cfg_scale = (cfg_scale - 1) * interp + 1.0
                cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits
                logits[:actual_batch_size] = cfg_logits

            logits = clear_forbidden_logits(logits, self.codebook_size, forbid_audio_eos=forbid_audio_eos)

            for item_idx in unfinished_items:
                # Fix: logits is (B, C*S, V); forbid the EOS *vocabulary entry* across
                # all codebook positions. The previous 2-index form masked an entire
                # codebook-position row instead of the EOS token.
                logits[item_idx, :, self.audio_eos_id] = float('-inf')
            for item_idx in finished_items:
                logits[item_idx, :, :] = float('-inf')
                logits[item_idx, :, self.audio_eos_id] = 0.0

            logits_topk = torch.topk(logits, topk, dim=-1)[0]
            indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1)
            logits_rescored = logits.clone()
            logits_rescored[indices_to_remove] = float('-inf')
            probs = torch.softmax(logits_rescored / temperature, dim=-1)
            sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len)
            if use_cfg:
                sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size]
                probs[actual_batch_size:] = probs[:actual_batch_size]
            if sampling_type != "purity_causal" and sampling_type != "purity_default":
                confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1)
            else:
                confidences = probs.max(dim=2)[0]
            # Re-commit the already-unmasked codes so they are never resampled.
            sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)
            if noise_scale > 0.0:
                noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps)
                confidences += noise
            if use_cfg:
                # Fix: this sync must only run under CFG — actual_batch_size is
                # undefined otherwise (previously raised NameError when use_cfg=False).
                confidences[actual_batch_size:] = confidences[:actual_batch_size]
            confidence_eps = 0.1
            assert (
                confidences.max() + confidence_eps < max_confidence
            ), f"Predicted confidence is approaching max_confidence: {confidences.max()}"
            confidences.scatter_(
                index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float)
            )
            codes = sampled_codes
        assert not (
            codes == self.mask_token_id
        ).any(), "Codes contain mask tokens after completion of MaskGit sampling"

        codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1)

        if use_cfg:
            codes = codes[:actual_batch_size]
        return codes
ccc8f315a3c2..31ad48f9dbfe 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -238,17 +238,23 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode( + pred_audio_codes, audio_codes_lens_target - 1 + ) pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( - pred_audio_codes, pred_audio_codes_lens, + pred_audio_codes, + pred_audio_codes_lens, ) target_audio_codes, _ = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode(target_audio_codes, audio_codes_lens_target - 1) + target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode( + target_audio_codes, audio_codes_lens_target - 1 + ) target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( - target_audio_codes, target_audio_codes_lens, + target_audio_codes, + target_audio_codes_lens, ) context_audio, context_audio_lens = None, None @@ -258,9 +264,12 @@ def log_val_audio_example( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode(context_audio_codes, context_audio_codes_lens) + context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode( + context_audio_codes, context_audio_codes_lens + ) context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( - context_audio_codes, context_audio_codes_lens, + context_audio_codes, + context_audio_codes_lens, ) for logger in self.loggers: @@ -1118,10 +1127,12 @@ def validation_step(self, batch, batch_idx): codes_len=context_audio_codes_lens, ) context_audio_codes_cleaned, context_audio_codes_lens_cleaned = 
self._prepare_codes_for_decode( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + context_audio_codes_cleaned, + context_audio_codes_lens_cleaned, ) context_audio_cleaned, context_audio_lens_cleaned, _ = self._codec_helper.codes_to_audio( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + context_audio_codes_cleaned, + context_audio_codes_lens_cleaned, ) for idx in range(infer_output.predicted_audio.size(0)): diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 555c30308e39..9167c14a92d5 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -1721,9 +1721,12 @@ def streaming_finalize( # No need to remove EOS - end_indices already point to the frame before EOS # Decode to audio (codes are already unstacked: B, C, T) - predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode(predicted_codes, predicted_codes_lens) + predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode( + predicted_codes, predicted_codes_lens + ) audio, audio_len, decoded_codes = self._codec_helper.codes_to_audio( - predicted_codes, predicted_codes_lens, + predicted_codes, + predicted_codes_lens, ) return StreamingFinalizeOutput( @@ -1824,9 +1827,7 @@ def infer_batch( elif 'audio' in batch: gt_audio = batch['audio'] gt_audio_lens = batch['audio_lens'] - gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes( - gt_audio, gt_audio_lens - ) + gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes(gt_audio, gt_audio_lens) else: raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 600ddda579bd..46287373909c 100644 --- 
a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -439,7 +439,8 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: ).long() context_codes, context_lens = self._prepare_codes_for_decode(context_codes, context_lens) context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( - context_codes, context_lens, + context_codes, + context_lens, ) return self._save_waveforms_to_paths( waveforms=context_audio, diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index f710bb853986..97b0de063008 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -1360,9 +1360,7 @@ def _prepare_audio_examples( pred_audio_codes, pred_audio_codes_lens = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens ) - pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( - pred_audio_codes, pred_audio_codes_lens - ) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio(pred_audio_codes, pred_audio_codes_lens) # Decode targets: remove EOS token, then decode to audio target_audio_codes, target_audio_codes_lens = remove_eos_token( @@ -1600,7 +1598,8 @@ def _get_context_audio_codes(self, batch: Dict[str, torch.Tensor]) -> Tuple[torc lens = batch['context_audio_codes_lens'] else: codes, lens = self._codec_helper.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], + batch['context_audio'], + batch['context_audio_lens'], sample_rate=batch.get('context_sample_rate'), ) @@ -2012,7 +2011,8 @@ def process_batch(self, batch): if 'audio_codes' not in batch: audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes( - batch['audio'], batch['audio_lens'], + batch['audio'], + batch['audio_lens'], sample_rate=batch.get('sample_rate'), ) else: diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py 
b/nemo/collections/tts/modules/magpietts_inference/inference.py index 2cebff638977..d7f6e48b7e9e 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -473,7 +473,8 @@ def _run_unified_inference( predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') predicted_audio, predicted_audio_lens, _ = self.model._codec_helper.codes_to_audio( - predicted_codes, predicted_codes_lens_tensor, + predicted_codes, + predicted_codes_lens_tensor, ) total_audio_samples = sum(predicted_audio_lens.cpu().tolist()) From 9d95ed4a87b7c3c489183615ecf196fc8e84918c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 12 Mar 2026 17:22:57 -0400 Subject: [PATCH 93/94] bug fixed Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/magpietts_inference/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 2cebff638977..3e0a7f36274a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -581,7 +581,7 @@ def create_dataset( pad_context_text_to_max_duration=False, context_duration_min=context_duration_min, context_duration_max=context_duration_max, - ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), + ignore_phoneme_languages=self.model.cfg.get('ignore_phoneme_languages', []), add_language_to_context_text=self.model.add_language_to_context_text, ) dataset.text_tokenizer = self.model.tokenizer From 50dd98d7e571cf60f23c0b0d8759a1cb439efa71 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 14 Mar 2026 13:09:38 -0700 Subject: [PATCH 94/94] bug fix in easy magpie LT training Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts_inference.py | 8 ++++++-- 1 
file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 9167c14a92d5..65abd37ce957 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -489,6 +489,10 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + # EasyMagpie stacks frames into the channel dimension (B, C*S, T_stacked) + # via stack_codes, unlike Magpie which keeps them interleaved in time (B, C, T_full). + # We pass num_audio_codebooks=C*S and frame_stacking_factor=1 so the helper + # treats each stacked channel as an independent codebook without time-domain striding. self._lt_helper = LocalTransformerHelper( local_transformer=self.local_transformer, audio_embeddings=self.audio_embeddings, @@ -496,8 +500,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): local_transformer_in_projection=self.local_transformer_in_projection, local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, local_transformer_out_projections=self.local_transformer_out_projections, - num_audio_codebooks=self.num_audio_codebooks, - frame_stacking_factor=self.frame_stacking_factor, + num_audio_codebooks=self.num_audio_codebooks * self.frame_stacking_factor, + frame_stacking_factor=1, audio_eos_id=self.audio_eos_id, mask_token_id=self.mask_token_id, codebook_size=self.codebook_size,