diff --git a/examples/nlp/entity_linking/build_index.py b/examples/nlp/entity_linking/build_index.py deleted file mode 100644 index eeba5c83130e..000000000000 --- a/examples/nlp/entity_linking/build_index.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -import random -from argparse import ArgumentParser - -import h5py -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf -from sklearn.decomposition import PCA -from tqdm import tqdm - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def build_index(cfg: DictConfig, model: object): - """ - Builds faiss index from index dataset specified in the config. - - Args: - cfg (DictConfig): Config file specifying index parameters - model (object): Encoder model - """ - - # Get index dataset embeddings - # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them - if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name): - logging.info("Loading reduced dimensionality embeddings") - embeddings = h5py.File(cfg.pca_embeddings_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - elif os.path.isfile(cfg.embedding_save_name): - logging.info("Loading previously extracted index dataset embeddings") - embeddings = h5py.File(cfg.embedding_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - else: - logging.info("Encoding index dataset, this may take a while") - index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True) - embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model) - - # Create pca model to reduce dimensionality of index dataset and decrease memory footprint - if cfg.apply_pca: - - # Need to train PCA model and apply PCA transformation with newly trained model - if not os.path.isfile(cfg.pca.pca_save_name): - logging.info("Fitting PCA model for embedding dimensionality reduction") - pca_train_set = random.sample(list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction)) - pca = PCA(n_components=cfg.pca.output_dim) - pca.fit(pca_train_set) - pkl.dump(pca, open(cfg.pca.pca_save_name, "wb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # PCA model already trained, just need to reduce dimensionality of all embeddings - elif not os.path.isfile(cfg.pca_embeddings_save_name): - pca = pkl.load(open(cfg.pca.pca_save_name, "rb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # Build faiss index from embeddings - logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus") - quantizer = faiss.IndexFlatL2(cfg.dims) - 
index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist) - index = faiss.index_cpu_to_all_gpus(index) - index.train(embeddings) - - logging.info("Adding dataset embeddings to index") - for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)): - index.add(embeddings[i : i + cfg.index_batch_size]) - - logging.info("Saving index") - faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name) - logging.info("Index built and saved") - - -def reduce_embedding_dim(pca, embeddings, cfg): - """Apply PCA transformation to index dataset embeddings""" - - logging.info("Applying PCA transformation to entire index dataset") - embeddings = np.array(pca.transform(embeddings), dtype=np.float32) - emb_file = h5py.File(cfg.pca_embeddings_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - return embeddings - - -def get_index_embeddings(cfg: DictConfig, dataloader: object, model: object): - """Use entity linking encoder to get embeddings for full index dataset""" - embeddings = [] - concept_ids = [] - - with torch.no_grad(): - for batch in tqdm(dataloader): - input_ids, token_type_ids, input_mask, batch_concept_ids = batch - input_ids = input_ids.to(device) - token_type_ids = token_type_ids.to(device) - input_mask = input_mask.to(device) - batch_embeddings = model.forward( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask - ) - - embeddings.extend(batch_embeddings.detach().cpu().numpy()) - concept_ids.extend(batch_concept_ids.numpy()) - - emb_file = h5py.File(cfg.embedding_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - pkl.dump(concept_ids, open(cfg.concept_id_save_name, "wb")) - - return embeddings, concept_ids - - -def load_model(cfg: DictConfig, restore: bool): - """ - Loads encoder model. - - Args: - cfg: Config file specifying model parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if restore: - model = EntityLinkingModel.restore_from(cfg.nemo_path) - else: - cfg.train_ds = None - cfg.validation_ds = None - cfg.test_ds = None - model = EntityLinkingModel(cfg) - - model = model.to(device) - - return model - - -def main(cfg: DictConfig, restore: bool): - """ - Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) - ): - logging.info("Building index") - build_index(cfg.index, model) - else: - logging.info("Index and pca model (if required) already exists. Skipping build index step.") - - if not os.path.isfile(cfg.index.idx_to_id): - logging.info("Mapping entity index postions to ids") - map_idx_to_ids(cfg.index) - else: - logging.info("Map from concept index to id already exists. 
Skipping mapping step.") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml deleted file mode 100644 index b7f538ccd68f..000000000000 --- a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -project_dir: null -name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 2 - enable_checkpointing: False - logger: false -model: - nemo_path: ??? - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - loss_params: null - train_ds: - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 768 - nlist: 2 - top_n: 3 - query_num_factor: 20 - index_save_name: ??? - index_batch_size: 10 - index_ds: - name: tiny_example - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 100 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/idx_to_id.pkl - id_to_string: ${project_dir}/id_to_string.pkl - concept_id_save_name: ${project_dir}/tiny_example_concept_ids.pkl - embedding_save_name: ${project_dir}/tiny_example_concept_embeddings.hdf5 - pca_embeddings_save_name: null - apply_pca: false - pca: null -exp_manager: - exp_dir: . - name: ${project_dir}/SelfAlignmentPretrainingTinyExample - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml deleted file mode 100644 index ad636ef23e18..000000000000 --- a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml +++ /dev/null @@ -1,95 +0,0 @@ -project_dir: ??? 
-name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 1000 - enable_checkpointing: False - logger: false -model: - nemo_path: ${project_dir}/sap_bert_umls.nemo - raw_data: ${project_dir}/data/MRCONSO.RRF - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - train_ds: - data_file: ${project_dir}/data/umls_train_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ${project_dir}/data/umls_validation_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 256 - nlist: 300 - top_n: 5 - query_num_factor: 20 - index_save_name: ${project_dir}/medical_entity_linking_index - index_batch_size: 1000 - raw_data: ${model.raw_data} - index_ds: - name: umls - data_file: ${project_dir}/data/umls_index_concepts.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/data/idx_to_id.pkl - id_to_string: ${project_dir}/data/id_to_string.pkl - concept_id_save_name: ${project_dir}/data/concept_ids.pkl - embedding_save_name: ${project_dir}/data/medical_concept_embeddings.hdf5 - pca_embeddings_save_name: ${project_dir}/data/medical_concept_reduced_${index.dims}dim_embeddings.hdf5 - apply_pca: true - pca: - input_dim: 756 - output_dim: ${index.dims} - sample_fraction: 0.5 - pca_save_name: ${project_dir}/${index.pca.input_dim}_to_${index.pca.output_dim}_pca_model.pkl -exp_manager: - exp_dir: ${project_dir}/medical_entity_linking_experiments - name: sap_bert_umls - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/entity_linking/data/umls_dataset_processing.py b/examples/nlp/entity_linking/data/umls_dataset_processing.py deleted file mode 100644 index 03a17da3c0bc..000000000000 --- a/examples/nlp/entity_linking/data/umls_dataset_processing.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -import pickle as pkl -import random -from argparse import ArgumentParser - -import pandas as pd -from omegaconf import OmegaConf -from tqdm import tqdm - -# Info on these headers can be found here on the UMLS website https://www.ncbi.nlm.nih.gov/books/NBK9685/ -# section 3.3.4 Table 1 -HEADERS = [ - 'CUI', - 'LAT', - 'TS', - 'LUI', - 'STT', - 'SUI', - 'ISPREF', - 'AUI', - 'SAUI', - 'SCUI', - 'SDUI', - 'SAB', - 'TTY', - 'CODE', - 'STR', - 'SRL', - 'SUPPRESS', - 'CVF', -] - - -def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers): - """ - Generates and saves UMLS self alignment pretraining train and validation data. Takes the raw .RRF UMLS - data file and creates different pair combinations for entities with the same CUI. Each row in the output - will be formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs. - Saves two .tsv output files, one for the train split and one for the validation split. - Only data marked as English is added to the train and val splits. - - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - train_save_name (str): path to where training data will be saved - val_save_name (str): path to where validation data will be saved - max_pairs (int): max number of pairs for any one CUI added to the train - or validation splits - train_split (float): precentage of raw data to be added to train set split - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading training data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - train_file = open(train_save_name, 'w') - val_file = open(val_save_name, 'w') - - cui = df["CUI"].iloc[0] - names = [] - random.seed(2021) - - for idx in tqdm(range(len(df))): - # Address incorrectly formatted data - if type(df["STR"].iloc[idx]) != str or "|" in df["STR"].iloc[idx]: - continue - - # Collect all english concept strings matching the current CUI - if df["CUI"].iloc[idx] == cui and df["LAT"].iloc[idx] == "ENG": - concept_string = df["STR"].iloc[idx] - names.append(concept_string) - - else: - # Pair off concept synonyms to make training and val sets - pairs = list(itertools.combinations(names, 2)) - - if len(pairs) == 0: - # Not enough concepts gathered to make a pair - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - continue - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - random.shuffle(pairs) - - # Keep up to max pairs number pairs for any one concept - for pair in pairs[:max_pairs]: - - # Want concepts in train and val splits to be randomly selected and mutually exclusive - add_to_train = random.random() - - if add_to_train <= train_split: - train_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - else: - val_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - - # Switch to next concept - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - - train_file.close() - val_file.close() - print("Finished making training and validation data") - - -def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers): - """ - Generates data file needed to build a UMLS index and a hash table mapping each - CUI to one canonical concept string. Takes the raw .RRF data file and saves - a .tsv indec concept file as well as the a .pkl file of cui to concept string - mappings. Only data marked as English is added to the index data file. 
- - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - data_savename (str): path to where .tsv index data will be saved - id2string_savename (str): path to where .pkl cui to string mapping will - be saved - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading index data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - id2string = {} - - with open(data_savename, "w") as outfile: - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - # Address incorrectly formatted data - if type(row["STR"]) != str or "|" in row["STR"]: - continue - - cui = row["CUI"] - sent = row["STR"] - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - - # Only keeping english concepts - if row["LAT"] == "ENG": - outfile.write(f'{cui}\t{sent}\n') - - # Matching each cui to one canonical string represention - if cui not in id2string and ":" not in sent: - id2string[cui] = sent - - outfile.close() - pkl.dump(id2string, open(id2string_savename, "wb")) - print("Finished saving index data and id to concept mapping") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--index", action="store_true", help="Whether to process data for building an index") - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml") - parser.add_argument( - "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concepts" - ) - parser.add_argument( - "--train_split", required=False, type=float, default=0.99, help="Precentage of data to add to train set" - ) - - args = parser.parse_args() - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - if args.index: - process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS) - else: - process_umls_training_dataset( - cfg.model.raw_data, - cfg.model.train_ds.data_file, - cfg.model.validation_ds.data_file, - args.max_pairs, - args.train_split, - HEADERS, - ) diff --git a/examples/nlp/entity_linking/query_index.py b/examples/nlp/entity_linking/query_index.py deleted file mode 100644 index 6cb51a7de160..000000000000 --- a/examples/nlp/entity_linking/query_index.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -from argparse import ArgumentParser -from collections import OrderedDict -from typing import Dict - -import numpy as np -import torch -from build_index import load_model -from omegaconf import DictConfig, OmegaConf - -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. 
Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def get_query_embedding(query, model): - """Use entity linking encoder to get embedding for index query""" - model_input = model.tokenizer( - query, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=512, - return_token_type_ids=True, - return_attention_mask=True, - ) - - query_emb = model.forward( - input_ids=torch.LongTensor([model_input["input_ids"]]).to(device), - token_type_ids=torch.LongTensor([model_input["token_type_ids"]]).to(device), - attention_mask=torch.LongTensor([model_input["attention_mask"]]).to(device), - ) - - return query_emb - - -def query_index( - query: str, cfg: DictConfig, model: object, index: object, pca: object, idx2id: dict, id2string: dict, -) -> Dict: - - """ - Query the nearest neighbor index of entities to find the - concepts in the index dataset that are most similar to the - query. - - Args: - query (str): entity to look up in the index - cfg (DictConfig): config object to specifiy query parameters - model (EntityLinkingModel): entity linking encoder model - index (object): faiss index - pca (object): sklearn pca transformation to be applied to queries - idx2id (dict): dictionary mapping unique concept dataset index to - its CUI - id2string (dict): dictionary mapping each unqiue CUI to a - representative english description of - the concept - Returns: - A dictionary with the concept ids of the index's most similar - entities as the keys and a tuple containing the string - representation of that concept and its cosine similarity to - the query as the values. - """ - query_emb = get_query_embedding(query, model).detach().cpu().numpy() - - if cfg.apply_pca: - query_emb = pca.transform(query_emb) - - dist, neighbors = index.search(query_emb.astype(np.float32), cfg.query_num_factor * cfg.top_n) - dist, neighbors = dist[0], neighbors[0] - unique_ids = OrderedDict() - neighbor_idx = 0 - - # Many of nearest neighbors could map to the same concept id, their idx is their unique identifier - while len(unique_ids) < cfg.top_n and neighbor_idx < len(neighbors): - concept_id_idx = neighbors[neighbor_idx] - concept_id = idx2id[concept_id_idx] - - # Only want one instance of each unique concept - if concept_id not in unique_ids: - concept = id2string[concept_id] - unique_ids[concept_id] = (concept, 1 - dist[neighbor_idx]) - - neighbor_idx += 1 - - unique_ids = dict(unique_ids) - - return unique_ids - - -def main(cfg: DictConfig, restore: bool): - """ - Loads faiss index and allows commandline queries - to the index. Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) or not os.path.isfile(cfg.index.idx_to_id) - ): - logging.warning("Either no index and/or no mapping from entity idx to ids exists. 
Please run `build_index.py`") - return - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - logging.info("Loading index and associated files") - index = faiss.read_index(cfg.index.index_save_name) - idx2id = pkl.load(open(cfg.index.idx_to_id, "rb")) - id2string = pkl.load(open(cfg.index.id_to_string, "rb")) # Should be created during dataset prep - - if cfg.index.apply_pca: - pca = pkl.load(open(cfg.index.pca.pca_save_name, "rb")) - - while True: - query = input("enter index query: ") - output = query_index(query, cfg.top_n, cfg.index, model, index, pca, idx2id, id2string) - - if query == "exit": - break - - for concept_id in output: - concept_details = output[concept_id] - concept_id = "C" + str(concept_id).zfill(7) - print(concept_id, concept_details) - - print("----------------\n") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/examples/nlp/entity_linking/self_alignment_pretraining.py b/examples/nlp/entity_linking/self_alignment_pretraining.py deleted file mode 100644 index 58b20f384d04..000000000000 --- a/examples/nlp/entity_linking/self_alignment_pretraining.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Please see tutorial at Nemo/tutorials/nlp/Entity_Linking_Medical.ipynb for -# more information on entity linking and self alignment pretraining. 
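The removed build_index.py and query_index.py scripts above wrap a common FAISS pattern: train an IVF index on encoder embeddings, add the dataset vectors, then map search results back to concept IDs. A minimal, self-contained sketch of that flow follows; the dimensions, nlist, nprobe, and the idx2id mapping are illustrative assumptions, not values taken from the deleted configs.

# Sketch of the FAISS IVF train/add/search flow used by the removed examples.
# All sizes and the idx2id mapping are stand-ins, not the original settings.
import faiss
import numpy as np

dims, nlist = 256, 300
embeddings = np.random.rand(10_000, dims).astype(np.float32)   # stand-in for encoder outputs
idx2id = {i: i for i in range(len(embeddings))}                 # stand-in for idx_to_id.pkl

quantizer = faiss.IndexFlatL2(dims)
index = faiss.IndexIVFFlat(quantizer, dims, nlist)
index.train(embeddings)            # IVF indexes must be trained before vectors are added
index.add(embeddings)
index.nprobe = 10                  # search more clusters than the default of 1

query = np.random.rand(1, dims).astype(np.float32)              # stand-in for a (PCA-reduced) query embedding
dist, neighbors = index.search(query, 5)
concept_ids = [idx2id[int(i)] for i in neighbors[0]]            # map index positions back to concept ids

As in the removed scripts, the search result is a list of index positions that still has to be translated into concept identifiers and deduplicated by the caller.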
- -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="umls_medical_entity_linking_config.yaml") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f"\nConfig Params:\n{OmegaConf.to_yaml(cfg)}") - trainer = Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - logging.info(f"Loading weights from pretrained model {cfg.model.language_model.pretrained_model_name}") - model = EntityLinkingModel(cfg=cfg.model, trainer=trainer) - logging.info("===========================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - logging.info("===========================================================================================") - - if cfg.model.nemo_path: - # '.nemo' file contains the last checkpoint and the params to initialize the model - model.save_to(cfg.model.nemo_path) - logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/glue_benchmark/glue_benchmark.py b/examples/nlp/glue_benchmark/glue_benchmark.py deleted file mode 100644 index 28efb9520fbd..000000000000 --- a/examples/nlp/glue_benchmark/glue_benchmark.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -## Tasks -This script works with all GLUE Benchmark tasks, more details about the GLUE Benchmark could be found at -https://gluebenchmark.com/ - -More details on how to use this script could be found in tutorials/nlp/GLUE_Benchmark.ipynb - -## Model Training - -To train GLUEModel with the default config file, run: - python glue_benchmark.py \ - model.dataset.data_dir= \ - model.task_name=TASK_NAME \ - trainer.max_epochs= \ - trainer.devices="[] - -Supported task names: -["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] -Note, MNLI task includes both matched and mismatched dev sets -""" - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import GLUEModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_name="glue_benchmark_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager_cfg = cfg.get("exp_manager", None) - - if exp_manager_cfg: - exp_manager_cfg.name = cfg.model.task_name - logging.info(f'Setting task_name to {exp_manager_cfg.name} in exp_manager') - exp_manager(trainer, exp_manager_cfg) - - if cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path): - model = GLUEModel.restore_from(cfg.model.nemo_path) - logging.info(f'Restoring model from {cfg.model.nemo_path}') - model.update_data_dir(data_dir=cfg.model.dataset.data_dir) - model.setup_training_data() - model.setup_multiple_validation_data() - trainer.fit(model) - else: - model = GLUEModel(cfg.model, trainer=trainer) - trainer.fit(model) - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml deleted file mode 100644 index 21cdc04db22f..000000000000 --- a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# GLUE Benchmark with pre-trained BERT models -supported_tasks: &supported_tasks ['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli'] - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - task_name: &task_name mrpc # choose from: ["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] GLUE task name, MNLI includes both matched and mismatched dev sets - supported_tasks: *supported_tasks - output_dir: null # dir to write write predictions - nemo_path: null # filename to save the model and associated artifacts to .nemo file - dataset: - data_dir: ??? 
# /path/to/data - max_seq_length: 128 - use_cache: true - - # shared across dataloaders: - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - ds_item: 'train.tsv' - shuffle: true - num_samples: -1 - batch_size: 32 - - validation_ds: - ds_item: 'dev.tsv' # for MNLI 'dev_matched.tsv' and 'dev_mismatched.tsv' will de used - shuffle: false - num_samples: -1 - batch_size: 32 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: *task_name # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/information_retrieval/bert_dpr.py b/examples/nlp/information_retrieval/bert_dpr.py deleted file mode 100644 index 4fc791da04fd..000000000000 --- a/examples/nlp/information_retrieval/bert_dpr.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertDPRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_dpr_model = BertDPRModel(cfg.model, trainer=trainer) - trainer.fit(bert_dpr_model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/bert_joint_ir.py b/examples/nlp/information_retrieval/bert_joint_ir.py deleted file mode 100644 index f95cdd04e036..000000000000 --- a/examples/nlp/information_retrieval/bert_joint_ir.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertJointIRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_joint_ir_model = BertJointIRModel(cfg.model, trainer=trainer) - trainer.fit(bert_joint_ir_model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml deleted file mode 100644 index 56e573e0bcf6..000000000000 --- a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Fine-tuning BERT model for information retrieval -name: &name BertIR -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 0.05 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - - language_model: - pretrained_model_name: bert-base-uncased - sim_score_dropout: 0.1 - lm_checkpoint: null - config: - attention_probs_dropout_prob: 0.1 - hidden_act: gelu - hidden_dropout_prob: 0.1 - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_hidden_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 - config_file: null # json file, precedence over config - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # tokenizer model for sentencepiece - special_tokens: null - - train_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with training queries and their indices - query_to_passages: null - # path to file with training examples which have the form of - # (query_id, relevant_passage_id, irrelevant_passage_1_id, ..., irrelevant_passage_n_id) - num_negatives: 10 - batch_size: 6 - psg_cache_format: npz - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - validation_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with validation queries and their indices - query_to_passages: null # path to file with passages to re-rank for each validation query - num_negatives: 10 - batch_size: 6 - psg_cache_format: pkl - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - optim: - name: adam - lr: 1e-5 - betas: [0.9, 0.999] - weight_decay: 0 - - sched: - name: WarmupAnnealing - warmup_steps: null - warmup_ratio: 0.05 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml b/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml deleted file mode 100644 index 7e4ecf09f5a0..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml +++ /dev/null @@ -1,160 +0,0 @@ -name: megatron_bert -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_bert - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - - -model: - # model parallelism - mcore_bert: True - micro_batch_size: 4 - global_batch_size: 8 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null - - # model architecture - encoder_seq_length: 512 - max_position_embeddings: ${.encoder_seq_length} - position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. - num_layers: 24 - hidden_size: 1024 - ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size. - num_attention_heads: 16 - transformer_block_type: post_ln - add_pooler: True - add_lm_head: False - init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. - normalization: layernorm - layernorm_epsilon: 1e-12 - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - megatron_legacy: False - tokenizer: - library: 'huggingface' - type: 'intfloat/e5-large-unsupervised' - model: null - vocab_file: null - merge_file: null - - # embedding-specific arguemnts - softmax_temp: 0.02 # softmax temp for contrastive loss - global_inbatch_negatives: True # whether to use in-batch negatives from other ranks during training - backprop_type: 'global' # whether to use `global` or `local` backpropagation during training. Refer to Flava paper for details. - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: False - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. - gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - ## Activation Checkpointing - # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. - # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - # 'full' will checkpoint the entire transformer layer. - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - num_micro_batches_with_partial_activation_checkpoints: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed - # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is - # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint - # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. - # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later - # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than - # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage - # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', - # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. - sequence_parallel: False - - data: - # Path to data must be specified by the user. 
- data_train: null - data_validation: null - hard_negatives_to_train: 4 # number of hard negatives to use per example for training - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic, LDDL - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - masked_lm_prob: 0.15 # Probability of replacing a token with mask. - short_seq_prob: 0.1 # Probability of producing a short sequence. - - optim: - name: fused_adam - lr: 2e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 2e-5 diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml deleted file mode 100644 index e407aec167e9..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml +++ /dev/null @@ -1,221 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.test_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: True - save_best_model: True - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 1 - micro_batch_size: 1 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. - transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - - peft: - peft_scheme: "lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - restore_from_ckpt: - checkpoint_dir: null - checkpoint_name: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) - adapter_dim: 32 - alpha: ${peft.lora_tuning.adapter_dim} - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - test_ds: - query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. - doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. - names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${global_batch_size} - micro_batch_size: ${micro_batch_size} - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - add_eos: True - add_bos: False - write_embeddings_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
- compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: output.txt - compute_attention_mask: True - -# server-related configs -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: True # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server 1058 -chat: False # use the chat interface -chatbot_config: - value: False # whether to inject the value attributes - attributes: - - name: Quality - min: 0 - max: 4 - key: quality - type: int - default: 4 - - name: Toxicity - min: 0 - max: 4 - key: toxcity - type: int - default: 0 - - name: Humor - min: 0 - max: 4 - key: humor - type: int - default: 0 - - name: Creativity - min: 0 - max: 4 - key: creativity - type: int - default: 0 - - name: Violence - min: 0 - max: 4 - key: violence - type: int - default: 0 - - name: Helpfulness - min: 0 - max: 4 - key: helpfulness - type: int - default: 4 - - name: Not_Appropriate - min: 0 - max: 4 - key: not_appropriate - type: int - default: 0 - - name: Language - choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] - key: lang - type: list - default: en - - user: User - assistant: Assistant - system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml deleted file mode 100644 index 1c2db1a862f4..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml +++ /dev/null @@ -1,220 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch - gradient_clip_val: null - num_sanity_val_steps: 0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: False - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: True - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. 
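The `temperature`, `num_soft_negatives`, and `use_all_possible_negatives` settings above configure a contrastive objective over query/document embeddings; the real loss lives in `MegatronGPTEmbeddingModel`. The snippet below is only a generic InfoNCE-with-in-batch-negatives sketch showing what the temperature scales; explicit hard negatives and the optional reward-model-style loss are omitted.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(query_emb, pos_doc_emb, temperature=0.02):
    """query_emb, pos_doc_emb: [batch, hidden]; row i of pos_doc_emb is the
    positive document for query i, every other row acts as a negative."""
    q = F.normalize(query_emb, dim=-1)
    d = F.normalize(pos_doc_emb, dim=-1)
    logits = q @ d.t() / temperature        # [batch, batch] scaled cosine similarities
    labels = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, labels)  # diagonal entries are the positives
```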
- transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - apply_rope_fusion: False - reward_model_loss: False # Set this to true to perform RLHF style reward model loss -log(sigmoid(accept_logit - reject_logit)) - - peft: - peft_scheme: "lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. 
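Each entry in `file_names` points to a JSONL file whose records follow the `{'input': ..., 'output': ...}` shape shown in the comments above. A minimal, hypothetical way to write such a file (the record below just reuses the example from that comment; real data comes from your own preprocessing):

```python
import json

records = [
    {
        "input": "John von Neumann\nVon Neumann made fundamental contributions .... "
                 "Q: What did the math of artificial viscosity do?",
        "output": "smoothed the shock transition without sacrificing basic physics",
    },
]
with open("my_dataset.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
```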
- min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: - - 1.0 - label_key: 'output' - add_eos: True - add_bos: False - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - validation_ds: - query_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - doc_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_embeddings_to_file: False - output_file_path_prefix: "validation_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
- num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml deleted file mode 100644 index 863b5fb475a0..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml +++ /dev/null @@ -1,222 +0,0 @@ -name: megatron_gpt_peft_reranker_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: null - num_sanity_val_steps: 0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: False - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: True - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. - apply_rope_fusion: False - transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - - peft: - peft_scheme: "mlp_head,lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - # Instead of using the GPT LM Head, we can use a custom head for the reranking task - mlp_head_tuning: - out_features: 1 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. - min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: - - 1.0 - label_key: 'output' - add_eos: True - add_bos: False - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ["validation"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_embeddings_to_file: False - output_file_path_prefix: "validation_rankings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. 
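The `mlp_head_tuning` block above (`out_features: 1`) swaps the GPT LM head for a small scoring head, so each (query, document) sequence yields a single relevance score. The real head is implemented inside `MegatronGPTRerankerModel`; the class below is only an illustrative stand-in, and the last-token pooling is an assumption.

```python
import torch.nn as nn

class RerankerScoringHead(nn.Module):
    """Map decoder hidden states of a (query, document) pair to one score."""

    def __init__(self, hidden_size, out_features=1):
        super().__init__()
        self.proj = nn.Linear(hidden_size, out_features)

    def forward(self, last_hidden_state):      # [batch, seq_len, hidden]
        pooled = last_hidden_state[:, -1]      # e.g. last-token pooling
        return self.proj(pooled).squeeze(-1)   # [batch] relevance scores
```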
- truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py deleted file mode 100644 index 7486b470425a..000000000000 --- a/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config") -def main(cfg) -> None: - if cfg.model.data.dataloader_type != "LDDL": - mp.set_start_method("spawn", force=True) - - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg) - - assert ( - model_cfg.micro_batch_size * cfg.trainer.devices * cfg.trainer.num_nodes == model_cfg.global_batch_size - ), "Gradiant accumulation is not supported for contrastive learning yet" - - OmegaConf.set_struct(model_cfg, True) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - - logging.info(f"Loading model from {cfg.restore_from_path}") - model = MegatronBertEmbeddingModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=model_cfg, - strict=True, - ) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py b/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py deleted file mode 100644 
index 9814129b837d..000000000000 --- a/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config") -def main(cfg) -> None: - if cfg.model.data.dataloader_type != "LDDL": - mp.set_start_method("spawn", force=True) - - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg) - - OmegaConf.set_struct(model_cfg, True) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - - logging.info(f"Loading model from {cfg.restore_from_path}") - model = MegatronBertEmbeddingModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=model_cfg, - strict=True, - ) - - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py deleted file mode 100644 index 9cb5cb5d3d19..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
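The BERT embedding finetuning script above asserts that `micro_batch_size * devices * num_nodes == global_batch_size`, i.e. gradient accumulation is not supported for the contrastive objective. A quick pre-launch sanity check of those settings (illustrative numbers; tensor and pipeline parallel sizes are assumed to be 1, so data-parallel size equals devices * num_nodes):

```python
def batch_settings_ok(global_batch_size, micro_batch_size, devices, num_nodes):
    data_parallel_size = devices * num_nodes   # assumes TP = PP = 1
    return global_batch_size == micro_batch_size * data_parallel_size

print(batch_settings_ok(32, 4, 8, 1))    # True: 4 per GPU * 8 GPUs = 32
print(batch_settings_ok(128, 4, 8, 1))   # False: would require gradient accumulation
```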
- -from collections.abc import MutableMapping - -import torch.multiprocessing as mp -from lightning.pytorch.loggers import WandbLogger -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: - items = [] - for k, v in d.items(): - new_key = parent_key + sep + k if parent_key else k - if isinstance(v, MutableMapping): - items.extend(flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTEmbeddingModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - if trainer.global_rank == 0: - for logger in trainer.loggers: - if isinstance(logger, WandbLogger): - fd = flatten_dict(dict(model_cfg), sep="/") - logger.experiment.config.update(fd) - model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint instead of randomly - # This is not the same as resume training because optimizer states are not restored. - logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - model.add_adapter(peft_cfg_cls(model_cfg)) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py deleted file mode 100644 index d66ddb339773..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
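For reference, the `flatten_dict` helper in the GPT embedding finetuning script above simply flattens the nested model config into separator-joined keys before it is pushed into the W&B run config:

```python
# using flatten_dict exactly as defined in the script above
cfg = {"model": {"peft": {"peft_scheme": "lora"}, "global_batch_size": 128}}
print(flatten_dict(cfg, sep="/"))
# {'model/peft/peft_scheme': 'lora', 'model/global_batch_size': 128}
```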
- - -import asyncio -import os -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -mp.set_start_method("spawn", force=True) - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, - daemon=True, - args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_generate_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - - if cfg.model.peft.restore_from_path: - model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - else: - model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) - - with open_dict(model_cfg): - model_cfg.post_process = False - - model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - if cfg.model.peft.restore_from_path: - model.load_adapters(cfg.model.peft.restore_from_path) - elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - checkpoint_path = os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - # checkpoint_path is a dir in case of distributed checkpointing - if not os.path.isdir(checkpoint_path): - # legacy checkpoint needs model parallel rank injection - checkpoint_path = inject_model_parallel_rank( - os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, 
cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - ) - model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) - else: - raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py deleted file mode 100644 index 5aad85646e3b..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import MutableMapping - -import torch.multiprocessing as mp -from lightning.pytorch.loggers import WandbLogger -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: - items = [] - for k, v in d.items(): - new_key = parent_key + sep + k if parent_key else k - if isinstance(v, MutableMapping): - items.extend(flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTRerankerModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - if trainer.global_rank == 0: - for logger in trainer.loggers: - if isinstance(logger, WandbLogger): - fd = flatten_dict(dict(model_cfg), sep="/") - logger.experiment.config.update(fd) - model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")] - peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint 
instead of randomly - # This is not the same as resume training because optimizer states are not restored. - logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - # model.add_adapter(peft_cfg_cls(model_cfg)) - model.add_adapter(peft_cfg_cls) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py deleted file mode 100644 index dea855963713..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import asyncio -import os -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -mp.set_start_method("spawn", force=True) - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, - daemon=True, - args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - 
choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_generate_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - - if cfg.model.peft.restore_from_path: - model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - else: - model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) - - with open_dict(model_cfg): - model_cfg.post_process = False - - model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - if cfg.model.peft.restore_from_path: - model.load_adapters(cfg.model.peft.restore_from_path) - elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: - peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")] - peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst] - - checkpoint_path = os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - # checkpoint_path is a dir in case of distributed checkpointing - if not os.path.isdir(checkpoint_path): - # legacy checkpoint needs model parallel rank injection - checkpoint_path = inject_model_parallel_rank( - os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - ) - model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls) - else: - raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml deleted file mode 100644 index df66111375cb..000000000000 --- a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 50 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: False - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? 
# /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "IntentSlot" # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml deleted file mode 100644 index c15c71e67c07..000000000000 --- a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: auto - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? 
# /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - validation_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - test_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "MultiLabelIntentSlot" # The name of your model - create_tensorboard_logger: False # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: False # Whether you want exp_manager to create a modelcheckpoint callback diff --git a/examples/nlp/intent_slot_classification/intent_slot_classification.py b/examples/nlp/intent_slot_classification/intent_slot_classification.py deleted file mode 100644 index 2025f48f330f..000000000000 --- a/examples/nlp/intent_slot_classification/intent_slot_classification.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import IntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = IntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. - logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = IntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on Assistant dataset - # for your own dataset change the examples appropriately - queries = [ - 'set alarm for seven thirty am', - 'lower volume by fifty percent', - 'what is my schedule for tomorrow', - ] - - pred_intents, pred_slots = eval_model.predict_from_examples(queries, cfg.model.test_ds) - - logging.info('The prediction results of some sample queries with the trained model:') - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intent: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py b/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py deleted file mode 100644 index 232aa7d4d230..000000000000 --- a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022, NVIDIA 
CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sample command to run the script: - -python multi_label_intent_slot_classification.py \ - model.data_dir=/home/user/multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - -fast_dev_run=false will save checkpoints for the model -""" - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import MultiLabelIntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="multi_label_intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = MultiLabelIntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. - logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = MultiLabelIntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # Optimize Threshold - eval_model.optimize_threshold(cfg.model.test_ds, 'dev') - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on ATIS dataset - # for your own dataset change the examples appropriately - queries = [ - 'i would like to find a flight from charlotte to las vegas that makes a stop in st. 
louis', - 'on april first i need a ticket from tacoma to san jose departing before 7 am', - 'how much is the limousine service in boston', - ] - - # We use the optimized threshold for predictions - pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, cfg.model.test_ds) - logging.info('The prediction results of some sample queries with the trained model:') - - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intents: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/spellchecking_asr_customization/README.md b/examples/nlp/spellchecking_asr_customization/README.md deleted file mode 100644 index 9d2063eff181..000000000000 --- a/examples/nlp/spellchecking_asr_customization/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# SpellMapper - spellchecking model for ASR Customization -Paper: https://arxiv.org/abs/2306.02317 -This model was partly inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf. -The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. -Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). - -As initial data we use about 5 mln entities from [YAGO corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/yago/downloads/). These entities are short phrases from Wikipedia headings. -In order to get misspelled predictions we feed these data to TTS model and then to ASR model. -Having a "parallel" corpus of "correct + misspelled" phrases, we use statistical machine translation techniques to create a dictionary of possible ngram mappings with their respective frequencies. -We create an auxiliary algorithm that takes as input a sentence (ASR hypothesis) and a large custom dictionary (e.g. 5000 phrases) and selects top 10 candidate phrases that are probably contained in this sentence in a misspelled way. -The task of our final neural model is to predict which fragments in the ASR hypothesis should be replaced by which of top-10 candidate phrases if any. - -The pipeline consists of multiple steps: - -1. Download or generate training data. - See `https://github.com/bene-ges/nemo_compatible/tree/main/scripts/nlp/en_spellmapper/dataset_preparation` - -2. [Optional] Convert training dataset to tarred files. - `convert_dataset_to_tarred.sh` - -3. Train spellchecking model. - `run_training.sh` - or - `run_training_tarred.sh` - -4. Run evaluation. - - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - -5. Run inference. 
- `python run_infer.sh` diff --git a/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py b/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py deleted file mode 100644 index c2f514f3e67e..000000000000 --- a/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script converts checkpoint .ckpt to .nemo file. - -This script uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. -""" - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - SpellcheckingAsrCustomizationModel.load_from_checkpoint(cfg.checkpoint_path).save_to(cfg.target_nemo_path) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml b/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml deleted file mode 100644 index f8dca7b974e5..000000000000 --- a/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: &name spellchecking -lang: ??? # e.g. 'ru', 'en' - -# Pretrained Nemo Models -pretrained_model: null - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 # the number of training epochs - enable_checkpointing: false # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -model: - do_training: true - label_map: ??? # path/.../label_map.txt - semiotic_classes: ??? 
# path/.../semiotic_classes.txt - max_sequence_len: 128 - lang: ${lang} - hidden_size: 768 - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.1 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_ratio: 0.1 - last_epoch: -1 - - language_model: - pretrained_model_name: bert-base-uncased # For ru, try DeepPavlov/rubert-base-cased | For de or multilingual, try bert-base-multilingual-cased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -tokenizer: - tokenizer_name: ${model.transformer} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -# Data -data: - train_ds: - data_path: ??? # provide the full path to the file - batch_size: 8 - shuffle: true - num_workers: 3 - pin_memory: false - drop_last: false - - validation_ds: - data_path: ??? # provide the full path to the file. - batch_size: 8 - shuffle: false - num_workers: 3 - pin_memory: false - drop_last: false - - -# Inference -inference: - from_file: null # Path to the raw text, no labels required. Each sentence on a separate line - out_file: null # Path to the output file - batch_size: 16 # batch size for inference.from_file diff --git a/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh b/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh deleted file mode 100644 index d4265eb4beb6..000000000000 --- a/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH="data_folder" - -## data_folder_example -## ├── tarred_data -## | └── (output) -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## ├── 1.tsv -## ├── ... 
-## └── 200.tsv - -## Each of {1-200}.tsv input files are 110'000 examples subsets of all.tsv (except for validation part), -## generated by https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh -## Note that in this example we use 110'000 as input and only pack 100'000 of them to tar file. -## This is because some input examples, e.g. too long, can be skipped during preprocessing, and we want all tar files to contain fixed equal number of examples. - -for part in {1..200} -do - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang="en" \ - data.train_ds.data_path=${DATA_PATH}/${part}.tsv \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - model.max_sequence_len=256 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - +output_tar_file=${DATA_PATH}/tarred_data/part${part}.tar \ - +take_first_n_lines=100000 -done diff --git a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py deleted file mode 100644 index 68c55ff51a4f..000000000000 --- a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create an index of custom vocabulary and save it to file. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import get_index, load_ngram_mappings - -parser = ArgumentParser(description="Create an index of custom vocabulary and save it to file") - -parser.add_argument( - "--input_name", required=True, type=str, help="Path to input file with custom vocabulary (plain text)" -) -parser.add_argument( - "--ngram_mappings", required=True, type=str, help="Path to input file with n-gram mapping vocabulary" -) -parser.add_argument("--output_name", required=True, type=str, help="Path to output file with custom vocabulary index") -parser.add_argument("--min_log_prob", default=-4.0, type=float, help="Threshold on log probability") -parser.add_argument( - "--max_phrases_per_ngram", - default=500, - type=int, - help="Threshold on number of phrases that can be stored for one n-gram key in index. 
Keys with more phrases are discarded.", -) -parser.add_argument( - "--max_misspelled_freq", default=125000, type=int, help="Threshold on maximum frequency of misspelled n-gram" -) - -args = parser.parse_args() - -# Load custom vocabulary -custom_phrases = set() -with open(args.input_name, "r", encoding="utf-8") as f: - for line in f: - phrase = line.strip() - custom_phrases.add(" ".join(list(phrase.replace(" ", "_")))) -print("Size of customization vocabulary:", len(custom_phrases)) - -# Load n-gram mappings vocabulary -ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=args.max_misspelled_freq) - -# Generate index of custom phrases -phrases, ngram2phrases = get_index( - custom_phrases, - ngram_mapping_vocab, - ban_ngram, - min_log_prob=args.min_log_prob, - max_phrases_per_ngram=args.max_phrases_per_ngram, -) - -# Save index to file -with open(args.output_name, "w", encoding="utf-8") as out: - for ngram in ngram2phrases: - for phrase_id, begin, size, logprob in ngram2phrases[ngram]: - phrase = phrases[phrase_id] - out.write(ngram + "\t" + phrase + "\t" + str(begin) + "\t" + str(size) + "\t" + str(logprob) + "\n") diff --git a/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py b/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py deleted file mode 100644 index d0bdc2c9bd30..000000000000 --- a/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create a tarred dataset for SpellcheckingAsrCustomizationModel. - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: -1. Obtain a processed dataset -2. 
Run: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang=${LANG} \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - model.language_model.pretrained_model_name=${LANGUAGE_MODEL} \ - model.label_map=${DATA_PATH}/label_map.txt \ - +output_tar_file=tarred/part1.tar \ - +take_first_n_lines=100000 - -""" -import pickle -import tarfile -from io import BytesIO - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - logging.info("Start creating tar file from " + cfg.data.train_ds.data_path + " ...") - _, model = instantiate_model_and_trainer( - cfg, MODEL, True - ) # instantiate model like for training because we may not have pretrained model - dataset = model._train_dl.dataset - archive = tarfile.open(cfg.output_tar_file, mode="w") - max_lines = int(cfg.take_first_n_lines) - for i in range(len(dataset)): - if i >= max_lines: - logging.info("Reached " + str(max_lines) + " examples") - break - ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) = dataset[i] - - # do not store masks as they are just arrays of 1 - content = { - "input_ids": input_ids, - "input_mask": input_mask, - "segment_ids": segment_ids, - "input_ids_for_subwords": input_ids_for_subwords, - "input_mask_for_subwords": input_mask_for_subwords, - "segment_ids_for_subwords": segment_ids_for_subwords, - "character_pos_to_subword_pos": character_pos_to_subword_pos, - "labels_mask": labels_mask, - "labels": labels, - "spans": spans, - } - b = BytesIO() - pickle.dump(content, b) - b.seek(0) - tarinfo = tarfile.TarInfo(name="example_" + str(i) + ".pkl") - tarinfo.size = b.getbuffer().nbytes - archive.addfile(tarinfo=tarinfo, fileobj=b) - - archive.close() - logging.info("Tar file " + cfg.output_tar_file + " created!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/spellchecking_asr_customization/helpers.py b/examples/nlp/spellchecking_asr_customization/helpers.py deleted file mode 100644 index 8e3957d34cc1..000000000000 --- a/examples/nlp/spellchecking_asr_customization/helpers.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
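
Each tar member written by `create_tarred_dataset.py` above is a pickled dict of arrays stored under the name `example_<i>.pkl`. When debugging a shard it can help to read a few members back; the sketch below is only an illustration under that assumption, not the loader used during tarred training, and `part1.tar` is a placeholder path.

```python
import pickle
import tarfile

# "part1.tar" is a placeholder; point it at a shard written by the script above.
with tarfile.open("part1.tar", mode="r") as archive:
    for member in archive.getmembers()[:3]:  # peek at the first few examples
        example = pickle.load(archive.extractfile(member))
        # Every stored value is an array-like produced by the dataset above.
        print(member.name, {key: getattr(value, "shape", None) for key, value in example.items()})
```
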
- - -import os -from typing import Tuple - -import lightning.pytorch as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.utils import logging - -__all__ = ["MODEL", "MODEL_NAMES", "instantiate_model_and_trainer"] - -MODEL = "spellchecking" -MODEL_NAMES = [MODEL] - - -def instantiate_model_and_trainer( - cfg: DictConfig, model_name: str, do_training: bool -) -> Tuple[pl.Trainer, SpellcheckingAsrCustomizationModel]: - """Function for instantiating a model and a trainer - Args: - cfg: The config used to instantiate the model and the trainer. - model_name: A str indicates the model direction, currently only 'itn'. - do_training: A boolean flag indicates whether the model will be trained or evaluated. - - Returns: - trainer: A PyTorch Lightning trainer - model: A SpellcheckingAsrCustomizationModel - """ - - if model_name not in MODEL_NAMES: - raise ValueError(f"{model_name} is unknown model type") - - # Get configs for the corresponding models - trainer_cfg = cfg.get("trainer") - model_cfg = cfg.get("model") - pretrained_cfg = cfg.get("pretrained_model", None) - trainer = pl.Trainer(**trainer_cfg) - if not pretrained_cfg: - logging.info(f"Initializing {model_name} model") - if model_name == MODEL: - model = SpellcheckingAsrCustomizationModel(model_cfg, trainer=trainer) - else: - raise ValueError(f"{model_name} is unknown model type") - elif os.path.exists(pretrained_cfg): - logging.info(f"Restoring pretrained {model_name} model from {pretrained_cfg}") - save_restore_connector = NLPSaveRestoreConnector() - model = SpellcheckingAsrCustomizationModel.restore_from( - pretrained_cfg, save_restore_connector=save_restore_connector - ) - else: - logging.info(f"Loading pretrained model {pretrained_cfg}") - if model_name == MODEL: - if pretrained_cfg not in SpellcheckingAsrCustomizationModel.get_available_model_names(): - raise ( - ValueError( - f"{pretrained_cfg} not in the list of available Tagger models." - f"Select from {SpellcheckingAsrCustomizationModel.list_available_models()}" - ) - ) - model = SpellcheckingAsrCustomizationModel.from_pretrained(pretrained_cfg) - else: - raise ValueError(f"{model_name} is unknown model type") - - # Setup train and validation data - if do_training: - model.setup_training_data(train_data_config=cfg.data.train_ds) - model.setup_validation_data(val_data_config=cfg.data.validation_ds) - - logging.info(f"Model {model_name} -- Device {model.device}") - return trainer, model diff --git a/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py b/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py deleted file mode 100644 index 871d5e5c0c0c..000000000000 --- a/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to postprocess SpellMapper results and generate an updated nemo ASR manifest. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - update_manifest_with_spellmapper_corrections, -) - -parser = ArgumentParser(description="Postprocess SpellMapper results and generate an updated nemo ASR manifest") - -parser.add_argument("--input_manifest", required=True, type=str, help="Path to input nemo ASR manifest") -parser.add_argument( - "--field_name", default="pred_text", type=str, help="Name of json field with original ASR hypothesis text" -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to input file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--spellmapper_results", required=True, type=str, help="Path to input file with SpellMapper inference results" -) -parser.add_argument("--output_manifest", required=True, type=str, help="Path to output nemo ASR manifest") -parser.add_argument("--min_prob", default=0.5, type=float, help="Threshold on replacement probability") -parser.add_argument( - "--use_dp", - action="store_true", - help="Whether to use additional replacement filtering by using dynamic programming", -) -parser.add_argument( - "--replace_hyphen_to_space", - action="store_true", - help="Whether to use space instead of hyphen in replaced fragments", -) -parser.add_argument( - "--ngram_mappings", type=str, required=True, help="File with ngram mappings, only needed if use_dp=true" -) -parser.add_argument( - "--min_dp_score_per_symbol", - default=-1.5, - type=float, - help="Minimum dynamic programming sum score averaged by hypothesis length", -) - -args = parser.parse_args() - -update_manifest_with_spellmapper_corrections( - input_manifest_name=args.input_manifest, - short2full_name=args.short2full_name, - output_manifest_name=args.output_manifest, - spellmapper_results_name=args.spellmapper_results, - min_prob=args.min_prob, - replace_hyphen_to_space=args.replace_hyphen_to_space, - field_name=args.field_name, - use_dp=args.use_dp, - ngram_mappings=args.ngram_mappings, - min_dp_score_per_symbol=args.min_dp_score_per_symbol, -) - -print("Resulting manifest saved to: ", args.output_manifest) diff --git a/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py b/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py deleted file mode 100644 index 6fd5e524390a..000000000000 --- a/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
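
The post-processing step above rewrites the ASR hypotheses of a NeMo manifest (JSON lines, one object per utterance) in the field given by `--field_name`, `pred_text` by default. A minimal sketch for spotting what changed is shown below; it assumes the input and output manifests stay aligned line by line, and the file names are placeholders.

```python
import json

FIELD = "pred_text"  # must match the --field_name used above

# Placeholder paths: the original manifest and the corrected one produced above.
with open("manifest.json", encoding="utf-8") as before_f, open("out_manifest.json", encoding="utf-8") as after_f:
    for before_line, after_line in zip(before_f, after_f):
        before = json.loads(before_line)[FIELD]
        after = json.loads(after_line)[FIELD]
        if before != after:
            print("-", before)
            print("+", after)
```
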
- - -""" -This script contains an example on how to prepare input for SpellMapper inference from a nemo ASR manifest. -It splits sentences to shorter fragments, runs candidate retrieval and generates input in the required format. -It produces two output files: - 1. File with correspondence between sentence fragments and full sentences. - 2. File that will serve as input for SpellMapper inference. - -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - extract_and_split_text_from_manifest, - get_candidates, - load_index, -) - -parser = ArgumentParser(description="Prepare input for SpellMapper inference from a nemo ASR manifest") -parser.add_argument("--manifest", required=True, type=str, help="Path to input manifest file") -parser.add_argument( - "--custom_vocab_index", required=True, type=str, help="Path to input file with custom vocabulary index" -) -parser.add_argument( - "--big_sample", - required=True, - type=str, - help="Path to input file with big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval", -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to output file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--output_name", - required=True, - type=str, - help="Path to output file that will serve as input for SpellMapper inference", -) -parser.add_argument("--field_name", default="pred_text", type=str, help="Name of json field with ASR hypothesis text") -parser.add_argument("--len_in_words", default=16, type=int, help="Maximum fragment length in words") -parser.add_argument( - "--step_in_words", - default=8, - type=int, - help="Step in words for moving to next fragment. If less than len_in_words, fragments will intersect", -) - -args = parser.parse_args() - -# Split ASR hypotheses to shorter fragments, because SpellMapper can't handle arbitrarily long sequences. -# The correspondence between short and original fragments is saved to a file and will be used at post-processing. 
-extract_and_split_text_from_manifest( - input_name=args.manifest, - output_name=args.short2full_name, - field_name=args.field_name, - len_in_words=args.len_in_words, - step_in_words=args.step_in_words, -) - -# Load index of custom vocabulary from file -phrases, ngram2phrases = load_index(args.custom_vocab_index) - -# Load big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval -big_sample_of_phrases = set() -with open(args.big_sample, "r", encoding="utf-8") as f: - for line in f: - phrase, freq = line.strip().split("\t") - if int(freq) > 50: # do not want to use frequent phrases as dummy candidates - continue - if len(phrase) < 6 or len(phrase) > 15: # do not want to use too short or too long phrases as dummy candidates - continue - big_sample_of_phrases.add(phrase) - -big_sample_of_phrases = list(big_sample_of_phrases) - -# Generate input for SpellMapper inference -out = open(args.output_name, "w", encoding="utf-8") -with open(args.short2full_name, "r", encoding="utf-8") as f: - for line in f: - short_sent, _ = line.strip().split("\t") - sent = "_".join(short_sent.split()) - letters = list(sent) - candidates = get_candidates(ngram2phrases, phrases, letters, big_sample_of_phrases) - if len(candidates) == 0: - continue - if len(candidates) != 10: - raise ValueError("expect 10 candidates, got: ", len(candidates)) - - # We add two columns with targets and span_info. - # They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample. - targets = [] - span_info = [] - for idx, c in enumerate(candidates): - if c[1] == -1: - continue - targets.append(str(idx + 1)) # targets are 1-based - start = c[1] - # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation) - end = min(c[1] + c[2], len(letters)) - span_info.append("CUSTOM " + str(start) + " " + str(end)) - out.write( - " ".join(letters) - + "\t" - + ";".join([x[0] for x in candidates]) - + "\t" - + " ".join(targets) - + "\t" - + ";".join(span_info) - + "\n" - ) -out.close() diff --git a/examples/nlp/spellchecking_asr_customization/run_infer.sh b/examples/nlp/spellchecking_asr_customization/run_infer.sh deleted file mode 100644 index b4bbdc4da375..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_infer.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
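
`prepare_input_from_manifest.py` above relies on two conventions: long hypotheses are split into overlapping fragments of `--len_in_words` words with a stride of `--step_in_words`, and each fragment (like each custom phrase) is encoded as space-separated characters with `_` standing in for word boundaries. The sketch below is a simplified, self-contained stand-in for both conventions; the real logic lives in `nemo.collections.nlp.data.spellchecking_asr_customization.utils`, and the sample sentence is invented.

```python
def split_into_fragments(text: str, len_in_words: int = 16, step_in_words: int = 8):
    """Overlapping sliding window over words (fragments overlap when step < length)."""
    words = text.split()
    if len(words) <= len_in_words:
        return [" ".join(words)]
    return [
        " ".join(words[start : start + len_in_words])
        for start in range(0, len(words) - step_in_words, step_in_words)
    ]


def to_spellmapper_characters(fragment: str) -> str:
    """Join words with '_' and separate every character with a space."""
    return " ".join(list(fragment.replace(" ", "_")))


# Invented sentence, long enough to produce two overlapping fragments.
sentence = "patient was diagnosed with hepatic cirrhosis of the liver and was given a new treatment plan for the condition"
for fragment in split_into_fragments(sentence, len_in_words=16, step_in_words=8):
    print(to_spellmapper_characters(fragment))
```
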
- -## RUN INFERENCE ON NEMO MANIFEST AND CUSTOM VOCABULARY - -## Path to NeMo repository -NEMO_PATH=NeMo - -## Download model repo from Hugging Face (if clone doesn't work, run "git lfs install" and try again) -git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en -## Download repo with test data -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_evaluation - -## Files in model repo -PRETRAINED_MODEL=spellmapper_asr_customization_en/training_10m_5ep.nemo -NGRAM_MAPPINGS=spellmapper_asr_customization_en/replacement_vocab_filt.txt -BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt - -## Override these two files if you want to test on your own data -## File with input nemo ASR manifest -INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json -## File containing custom words and phrases (plain text) -CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.txt - -## Other files will be created -## File with index of custom vocabulary -INDEX="index.txt" -## File with short fragments and corresponding original sentences -SHORT2FULL="short2full.txt" -## File with input for SpellMapper inference -SPELLMAPPER_INPUT="spellmapper_input.txt" -## File with output of SpellMapper inference -SPELLMAPPER_OUTPUT="spellmapper_output.txt" -## File with output nemo ASR manifest -OUTPUT_MANIFEST="out_manifest.json" - - -# Create index of custom vocabulary -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py \ - --input_name ${CUSTOM_VOCAB} \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --output_name ${INDEX} \ - --min_log_prob -4.0 \ - --max_phrases_per_ngram 600 - -# Prepare input for SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \ - --manifest ${INPUT_MANIFEST} \ - --custom_vocab_index ${INDEX} \ - --big_sample ${BIG_SAMPLE} \ - --short2full_name ${SHORT2FULL} \ - --output_name ${SPELLMAPPER_INPUT} \ - --field_name "pred_text" \ - --len_in_words 16 \ - --step_in_words 8 - -# Run SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_MODEL} \ - model.max_sequence_len=512 \ - inference.from_file=${SPELLMAPPER_INPUT} \ - inference.out_file=${SPELLMAPPER_OUTPUT} \ - inference.batch_size=16 \ - lang=en - -# Postprocess and create output corrected manifest -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \ - --input_manifest ${INPUT_MANIFEST} \ - --short2full_name ${SHORT2FULL} \ - --output_manifest ${OUTPUT_MANIFEST} \ - --spellmapper_result ${SPELLMAPPER_OUTPUT} \ - --replace_hyphen_to_space \ - --field_name "pred_text" \ - --use_dp \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --min_dp_score_per_symbol -1.5 - -# Check WER of initial manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${INPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True - -# Check WER of corrected manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${OUTPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True diff --git a/examples/nlp/spellchecking_asr_customization/run_training.sh b/examples/nlp/spellchecking_asr_customization/run_training.sh deleted file mode 100644 index 85dddbb2a038..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_training.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## TRAIN WITH NON-TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -## Download repo with training data (very small example) -## If clone doesn't work, run "git lfs install" and try again -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_train_micro - -DATA_PATH=spellmapper_en_train_micro - -## Example of all files needed to run training with non-tarred data: -## spellmapper_en_train_micro -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## └── train.tsv - -## To generate files config.json, label_map.txt, semiotic_classes.txt - run generate_configs.sh -## Files "train.tsv" and "test.tsv" contain training examples. -## For data preparation see https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh - -## Note that training with non-tarred data only works on single gpu. It makes sense if you use 1-2 million examples or less. - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=8 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.lr=3e-5 \ - trainer.devices=[1] \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh b/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh deleted file mode 100644 index 655c3e23e610..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## TRAIN WITH TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH=data_folder - -## data_folder_example -## ├── train_tarred -## | ├── part1.tar -## | ├── ... 
-## | └── part200.tar -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## └── test.tsv -## To generate files config.json, label_map.txt, semiotic_classes.txt, run generate_configs.sh -## To prepare data, see ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/build_training_data.sh -## To convert data to tarred format, split all.tsv to pieces of 110'000 examples (except for validation part) and use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/convert_data_to_tarred.sh -## To run training with tarred data, use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh - -## ATTENTION: How to calculate model.optim.sched.max_steps: -## Suppose, you have 2'000'000 training examples, and want to train for 5 epochs on 4 gpus with batch size 32. -## 5 (epochs) * 32 (bs) * 4 (gpus) -## 1 step consumes 128 examples (32(bs) * 4(gpus)) -## 1 epoch makes 2000000/128=15625 steps (updates) -## 5 epochs make 5*15625=78125 steps - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train_tarred/part_OP_1..100_CL_.tar \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=16 \ - +data.train_ds.use_tarred_dataset=true \ - data.train_ds.shuffle=false \ - data.validation_ds.batch_size=16 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.sched.name=CosineAnnealing \ - +model.optim.sched.max_steps=195313 \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py deleted file mode 100644 index 593264f14a5d..000000000000 --- a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to run inference with the SpellcheckingAsrCustomizationModel. - -An input line should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 1-based ids of non-dummy candidates - 4. 
approximate start/end coordinates of non-dummy candidates (correspond to ids in third column) - -Example input (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 1 2 6 7 8 9 10 - CUSTOM 6 23;CUSTOM 4 10;CUSTOM 4 15;CUSTOM 56 62;CUSTOM 5 19;CUSTOM 28 31;CUSTOM 39 48 - -Each line in SpellMapper output is tab-separated and consists of 4 columns: - 1. ASR-hypothesis (same as in input) - 2. 10 candidates separated with semicolon (same as in input) - 3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability) - 4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes) - -Example output (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 56 62 7 0.99998;4 20 8 0.95181;12 20 8 0.44829;4 17 8 0.99464;12 17 8 0.97645 - 8 8 8 0 8 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 - - -USAGE Example: -1. Train a model, or use a pretrained checkpoint. -2. Run on a single file: - python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - inference.from_file=input.txt \ - inference.out_file=output.txt \ - inference.batch_size=16 \ - lang=en -or on multiple files: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - +inference.from_filelist=filelist.txt \ - +inference.output_folder=output_folder \ - inference.batch_size=16 \ - lang=en - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. 
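
For inspecting SpellMapper predictions outside the NeMo pipeline, a small parser over the output format documented above can be handy. The sketch below only assumes the tab- and semicolon-separated layout described here; filtering the parsed tuples by probability mirrors the `--min_prob` threshold applied later at post-processing.

```python
def parse_output_line(line: str):
    """Parse one tab-separated SpellMapper output line into its four columns."""
    hypothesis, candidates, fragment_predictions, letter_predictions = line.rstrip("\n").split("\t")
    replacements = []
    if fragment_predictions:
        for prediction in fragment_predictions.split(";"):
            start, end, candidate_id, probability = prediction.split()
            replacements.append((int(start), int(end), int(candidate_id), float(probability)))
    return hypothesis, candidates.split(";"), replacements, letter_predictions.split()


# Synthetic line (candidate list truncated for brevity); real lines carry 10 candidates.
sample = "t h e _ t h o r a x\tt h o r a x;a v f\t4 10 1 0.98\t0 0 0 0 1 1 1 1 1 1"
print(parse_output_line(sample)[2])  # [(4, 10, 1, 0.98)]
```
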
-""" - - -import os - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - if cfg.pretrained_model is None: - raise ValueError("A pre-trained model should be provided.") - _, model = instantiate_model_and_trainer(cfg, MODEL, False) - - if cfg.model.max_sequence_len != model.max_sequence_len: - model.max_sequence_len = cfg.model.max_sequence_len - model.builder._max_seq_length = cfg.model.max_sequence_len - input_filenames = [] - output_filenames = [] - - if "from_filelist" in cfg.inference and "output_folder" in cfg.inference: - filelist_file = cfg.inference.from_filelist - output_folder = cfg.inference.output_folder - with open(filelist_file, "r", encoding="utf-8") as f: - for line in f: - path = line.strip() - input_filenames.append(path) - folder, name = os.path.split(path) - output_filenames.append(os.path.join(output_folder, name)) - else: - text_file = cfg.inference.from_file - logging.info(f"Running inference on {text_file}...") - if not os.path.exists(text_file): - raise ValueError(f"{text_file} not found.") - input_filenames.append(text_file) - output_filenames.append(cfg.inference.out_file) - - dataloader_cfg = { - "batch_size": cfg.inference.get("batch_size", 8), - "num_workers": cfg.inference.get("num_workers", 4), - "pin_memory": cfg.inference.get("num_workers", False), - } - for input_filename, output_filename in zip(input_filenames, output_filenames): - if not os.path.exists(input_filename): - logging.info(f"Skip non-existing {input_filename}.") - continue - model.infer(dataloader_cfg, input_filename, output_filename) - logging.info(f"Predictions saved to {output_filename}.") - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py deleted file mode 100644 index ac50b4121f15..000000000000 --- a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to train SpellMapper (SpellcheckingAsrCustomizationModel). -It uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. 
- -USAGE Example: - See `examples/nlp/spellchecking_asr_customization/run_training.sh` for training on non-tarred data. - and - `examples/nlp/spellchecking_asr_customization/run_training_tarred.sh` for training on tarred data. - -One (non-tarred) training example should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 1-based ids of correct candidates, or 0 if none - 4. start/end coordinates of correct candidates (correspond to ids in third column) -Example (in one line): - a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o - d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y - 1 3 - CUSTOM 12 23;CUSTOM 28 41 -""" - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - # Train the model - if cfg.model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Start training...') - trainer, model = instantiate_model_and_trainer(cfg, MODEL, True) - spellchecking_exp_manager = cfg.get('exp_manager', None) - exp_manager(trainer, spellchecking_exp_manager) - trainer.fit(model) - logging.info('Training finished!') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md deleted file mode 100644 index 808ed2856fb2..000000000000 --- a/examples/nlp/token_classification/README.md +++ /dev/null @@ -1,2 +0,0 @@ -> [!IMPORTANT] -> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml deleted file mode 100644 index cc374f538c93..000000000000 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Punctuation and capitalization model with pretrained BERT-like models - -pretrained_model: null # pretrained Punctuation and Capitalization model from list_available_models(), for example: -# punctuation_en_bert or punctuation_en_distilbert -# or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - class_labels: - punct_labels_file: punct_label_ids.csv - capit_labels_file: capit_label_ids.csv - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. - use_tarred_dataset: false - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` are stored. - ds_item: ??? - - text_file: text_train.txt - labels_file: labels_train.txt - # Permutes batches every epoch - shuffle: true - num_samples: -1 - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. 
- tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - ds_item: ??? # expected format: [PATH_TO_DEV1,PATH_TO_DEV2] (Note no space between the paths and square brackets) - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml deleted file mode 100644 index e727d22aca54..000000000000 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Punctuation and capitalization lexical audio model with pretrained BERT-like models and Encoder-Decoder-like models. -pretrained_model: null # pretrained Punctuation and Capitalization Lexical Audio model from list_available_models(), for example: -# -# or your_model.nemo -trainer: - devices: -1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, - # LR schedulers, apex, etc. - log_every_n_steps: 50 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization_Lexical_Audio # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - save_best_model: true - resume_from_checkpoint: null - -model: - audio_encoder: - pretrained_model: stt_en_conformer_ctc_medium # You can choose any pretrained ASR model from list_available_models() of EncDecCTCModel. - freeze: - is_enabled: false # If set to True weights of audio encoder will not be updated during training. - d_model: 256 # Input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - d_ff: 1024 # Hidden dimension of PositionwiseFeedForward - num_layers: 4 # Number of additional Conformer layers - adapter: - enable: false # If set to True will enable adapters for audio encoder. - config: - # For more details see `nemo.collections.common.parts.LinearAdapter` class - in_features: -1 # Will be replaced with size of audio encoder - dim: 128 # Hidden dimension of the feed forward network. - activation: 'swish' # Str name for an activation function. - fusion: - num_layers: 4 # Number of layers to use in fusion - num_attention_heads: 4 # Number of attention heads to use in fusion - inner_size: 2048 # Fusion inner size - - class_labels: - punct_labels_file: punct_label_ids.txt - capit_labels_file: capit_label_ids.txt - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. 
- use_tarred_dataset: false - - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` and `audio_file` are stored. - ds_item: ??? - text_file: text_train.txt - labels_file: labels_train.txt - audio_file: audio_train.txt - - use_audio: true # Has to be set to true to use it for lexical audio model. - use_bucketing: true # If set to true batches will be sorted by length of audios and packed in batches limited by `tokens_in_batch`. Otherwise, provide `batch_size` parameter. - # If set to true audios will be loaded to memory during __init__ call of `BertPunctuationCapitalizationDataset`, consumes more RAM. - # Otherwise, audios will be loaded during `collate_fn` call of `BertPunctuationCapitalizationDataset`. - preload_audios: true - - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 2048 - max_seq_length: 512 - - sample_rate: 16000 # Target sample rate of audios can be used for downsampling or upsamling. - num_workers: 0 - - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. - tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. 
- tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null \ No newline at end of file diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml deleted file mode 100644 index 05024c781dab..000000000000 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
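The `tokens_in_batch` comments in the train_ds sections above describe length-based bucketing rather than a fixed batch size. The sketch below is a simplified illustration of that packing idea (assumed behaviour, not the NeMo dataloader itself): sort examples by length, then grow a batch until adding one more example would push max_length_in_batch * batch_size over the token budget.

    def pack_by_token_budget(lengths, tokens_in_batch):
        # Sort example indices by sequence length so similarly sized examples share a
        # batch and little padding is needed.
        order = sorted(range(len(lengths)), key=lambda i: lengths[i])
        batches, current = [], []
        for i in order:
            longest = lengths[i]  # lengths are visited in ascending order
            if current and longest * (len(current) + 1) > tokens_in_batch:
                batches.append(current)
                current = []
            current.append(i)
        if current:
            batches.append(current)
        return batches

    # Longer sequences -> fewer examples per batch, as the config comments say.
    print(pack_by_token_budget([5, 3, 8, 8, 2, 7], tokens_in_batch=16))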
- -# Token Classification tasks (for example, Named Entity Recognition) with pretrained BERT-like models - -pretrained_model: null # pretrained TokenClassification model from list_available_models() or path to a .nemo file, -# for example: ner_en_bert or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: token_classification_model # The name of your model - create_tensorboard_logger: true # Whether you want exp_manager to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - label_ids: null # will be filled during training - class_labels: - class_labels_file: label_ids.csv # will be generated during training and saved in .nemo file - dataset: - data_dir: ??? # /path/to/data - class_balancing: null # choose from [null, weighted_loss]. Weighted_loss enables the weighted class balancing of the loss, may be used for handling unbalanced classes - max_seq_length: 128 - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: false - use_cache: false - # shared among dataloaders - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - text_file: text_train.txt - labels_file: labels_train.txt - shuffle: true - num_samples: -1 - batch_size: 64 - - validation_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - test_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - - head: - num_fc_layers: 2 - fc_dropout: 0.5 - activation: 'relu' - use_transformer_init: True - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py deleted file mode 100644 index d74c2d8bc19a..000000000000 --- a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
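The `text_file`/`labels_file` pairs referenced by the token classification config above use one whitespace-tokenized sentence per line in the text file and one space-separated label per word in the labels file. A toy example of producing such a pair; the sentence and tag set are illustrative only.

    # Write a one-sentence toy dataset in the expected two-file layout.
    text_lines = ["jim bought shares of acme corp in 2006"]
    label_lines = ["B-PER O O O B-ORG I-ORG O O"]  # one label per word, illustrative tags

    with open("text_train.txt", "w") as text_f, open("labels_train.txt", "w") as labels_f:
        text_f.write("\n".join(text_lines) + "\n")
        labels_f.write("\n".join(label_lines) + "\n")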
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import multiprocessing as mp -from pathlib import Path - -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset import ( - DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME, - DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME, - METADATA_CAPIT_LABEL_VOCAB_KEY, - METADATA_PUNCT_LABEL_VOCAB_KEY, - build_label_ids_from_list_of_labels, - check_labels_for_being_unique_before_building_label_ids, - check_tar_file_prefix, - create_tarred_dataset, -) - - -""" -A tarred dataset allows to train on large amounts without storing it all into memory simultaneously. In case of -punctuation and capitalization model, tarred dataset is a directory which contains metadata file, tar files with -batches, punct_label_vocab.csv and capit_label_vocab.csv files. - -A metadata file is a JSON file with 4 fields: 'num_batches', 'tar_files', 'punct_label_vocab_file', -'capit_label_vocab_file'. 'num_batches' (int) is a total number of batches in tarred dataset. 'tar_files' is a list of -paths to tar files relative to directory containing the metadata file. 'punct_label_vocab_file' and -'capit_label_vocab_file' are paths to .csv files containing all unique punctuation and capitalization labels. Each -label in these files is written in a separate line. The first labels in both files are equal and serve for padding and -as neutral labels. - -Every tar file contains objects written using `webdataset.TarWriter`. Each object is a dictionary with two items: -'__key__' and 'batch.pyd'. '__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains -'input_ids', 'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, -'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are arrays with -ids of labels. Metadata file should be passed to constructor of -`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of -the class will handle iteration and constructing masks and token types for BERT model. - -Example of usage: - -python create_punctuation_capitalization_tarred_dataset.py \ - --text \ - --labels \ - --output_dir \ - --lines_per_dataset_fragment 10000 \ - --tokens_in_batch 8000 \ - --num_batches_per_tarfile 5 \ - --tokenizer_name char \ - --vocab_file -""" - - -def get_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"A tarred dataset allows to train on large amounts without storing it all into memory " - f"simultaneously. In case of punctuation and capitalization model, tarred dataset is a directory which " - f"contains metadata file, tar files with batches, {DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME} and " - f"{DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME} files. 
A metadata file is a JSON file with 4 fields: 'num_batches', " - f"'tar_files', '{METADATA_PUNCT_LABEL_VOCAB_KEY}', '{METADATA_CAPIT_LABEL_VOCAB_KEY}'. 'num_batches' (int) is " - f"a total number of batches in tarred dataset. 'tar_files' is a list of paths to tar files relative " - f"to directory containing the metadata file. '{METADATA_PUNCT_LABEL_VOCAB_KEY}' and " - f"'{METADATA_CAPIT_LABEL_VOCAB_KEY}' are paths to .csv files containing all unique punctuation and " - f"capitalization labels. Each label in these files is written in a separate line. The first labels in both " - f"files are equal and serve for padding and as neutral labels. Every tar file contains objects written " - f"using `webdataset.TarWriter`. Each object is a dictionary with two items: '__key__' and 'batch.pyd'. " - f"'__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains 'input_ids', " - f"'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, " - f"'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are " - f"arrays with ids of labels. Metadata file should be passed to constructor of " - "`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of " - "the class will handle iteration and constructing masks and token types for BERT model.", - ) - parser.add_argument( - "--text", - "-t", - help="Path to source lowercased text without punctuation. Number of lines in `--text` file has to be equal " - "to number of lines in `--labels` file.", - type=Path, - required=True, - ) - parser.add_argument( - "--audio_file", - type=Path, - required=False, - help="Path to source file which contains paths to audio one path per line. " - "Number of lines in `--audio_file` has to be equal to number of lines in `--labels` file", - ) - parser.add_argument( - "--use_audio", - required=False, - action="store_true", - help="If set to `True` script creates lexical audio dataset which can be used with `PunctuationCapitalizationLexicalAudioModel`.", - ) - parser.add_argument( - "--sample_rate", - type=int, - required=False, - help="Target sample rate of audios. Can be used for downsampling or upsampling.", - ) - parser.add_argument( - "--labels", - "-L", - type=Path, - required=True, - help="Path to file with labels in the format described here " - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#" - "nemo-data-format . Number of lines in `--labels` file has to be equal to the number of lines in `--text` " - "file.", - ) - parser.add_argument( - "--output_dir", - "-o", - type=Path, - required=True, - help="Path to directory where .tar files, metadata file, label id files are stored.", - ) - parser.add_argument( - "--max_seq_length", - "-s", - type=int, - default=512, - help="Maximum number of subtokens in an input sequence. A source sequence which contain too many subtokens are " - "clipped to `--max_seq_length - 2` subtokens and then [CLS] token is prepended to the clipped sequence and " - "[SEP] token is appended to the clipped sequence. The clipping is performed via removal of subtokens in the " - "end of a source sequence.", - ) - parser.add_argument( - "--tokens_in_batch", - "-b", - type=int, - default=15000, - help="Maximum number of tokens in a batch including [CLS], [SEP], [UNK], and [PAD] tokens. 
Before packing into " - "batches source sequences are sorted by number of tokens in order to reduce number of pad tokens. So the " - "number of sequences in a batch may be different.", - ) - parser.add_argument( - "--lines_per_dataset_fragment", - type=int, - default=10 ** 6, - help="A number of lines processed by one worker during creation of tarred dataset. A worker tokenizes " - "`--lines_per_dataset_fragment` lines and keeps in RAM tokenized text labels before packing them into " - "batches. Reducing `--lines_per_dataset_fragment` leads to reducing of the amount of memory required by this " - "script.", - ) - parser.add_argument( - "--num_batches_per_tarfile", - type=int, - default=1000, - help="A number of batches saved in a tar file. If you increase `--num_batches_per_tarfile`, then there will " - "be less tar files in the dataset. There cannot be less then `--num_batches_per_tarfile` batches in a tar " - "file, and all excess batches are removed. Maximum number of discarded batches is " - "`--num_batches_per_tarfile - 1`.", - ) - parser.add_argument( - "--tokenizer_name", - "-T", - default="bert-base-uncased", - help="Name of the tokenizer used for tokenization of source sequences. Possible options are 'sentencepiece', " - "'word', 'char', HuggingFace tokenizers. For more options see function " - "`nemo.collections.nlp.modules.common.get_tokenizer`. The tokenizer has to have properties `cls_id`, " - "`pad_id`, `sep_id`, `unk_id`.", - ) - parser.add_argument( - "--tokenizer_model", "-m", type=Path, help="Path to tokenizer model required for 'sentencepiece' tokenizer." - ) - parser.add_argument( - "--vocab_file", - "-v", - type=Path, - help="Path to vocabulary file which can be used in 'word', 'char', and HuggingFace tokenizers.", - ) - parser.add_argument( - "--merges_file", "-M", type=Path, help="Path to merges file which can be used in HuggingFace tokenizers." - ) - parser.add_argument( - "--special_token_names", - "-n", - nargs="+", - help="Names of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--special_token_values", - "-V", - nargs="+", - help="Values of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--use_fast_tokenizer", "-f", action="store_true", help="Whether to use fast HuggingFace tokenizer." - ) - parser.add_argument( - "--pad_label", - "-P", - default='O', - help="Pad label both for punctuation and capitalization. This label is also is used for marking words which " - "do not need punctuation and capitalization. It is also a neutral label used for marking words which do " - "not require punctuation and capitalization.", - ) - punct = parser.add_mutually_exclusive_group(required=False) - punct.add_argument( - "--punct_labels", - "-p", - nargs="+", - help="All punctuation labels EXCEPT PAD LABEL. Punctuation labels are strings separated by spaces. " - "Alternatively you can use parameter `--punct_label_vocab_file`. If none of parameters `--punct_labels` " - "and `--punct_label_vocab_file` are provided, then punctuation label ids will be inferred from `--labels` " - "file.", - ) - punct.add_argument( - "--punct_label_vocab_file", - type=Path, - help="A path to file with punctuation labels. These labels include pad label. Pad label has to be the first " - "label in the file. Each label is written on separate line. 
Alternatively you can use `--punct_labels` " - "parameter. If none of parameters `--punct_labels` and `--punct_label_vocab_file` are provided, then " - "punctuation label ids will be inferred from `--labels` file.", - ) - capit = parser.add_mutually_exclusive_group(required=False) - capit.add_argument( - "--capit_labels", - "-c", - nargs="+", - help="All capitalization labels EXCEPT PAD LABEL. Capitalization labels are strings separated by spaces. " - "Alternatively you can use parameter `--capit_label_vocab_file`. If none of parameters `--capit_labels` " - "and `--capit_label_vocab_file` are provided, then capitalization label ids will be inferred from `--labels` " - "file.", - ) - capit.add_argument( - "--capit_label_vocab_file", - type=Path, - help="A path to file with capitalization labels. These labels include pad label. Pad label has to be the " - "first label in the file. Each label is written on separate line. Alternatively you can use `--capit_labels` " - "parameter. If none of parameters `--capit_labels` and `--capit_label_vocab_file` are provided, then " - "capitalization label ids will be inferred from `--labels` file.", - ) - parser.add_argument( - "--tar_file_prefix", - "-x", - default="punctuation_capitalization", - help="A string from which tar file names start. It can contain only characters 'A-Z', 'a-z', '0-9', '_', '-', " - "'.'.", - ) - parser.add_argument( - "--n_jobs", - "-j", - type=int, - default=mp.cpu_count(), - help="Number of workers for creating tarred dataset. By default it is equal to the number of CPU cores.", - ) - args = parser.parse_args() - for name in [ - "text", - "labels", - "output_dir", - "tokenizer_model", - "vocab_file", - "merges_file", - "punct_label_vocab_file", - "capit_label_vocab_file", - ]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - if args.special_token_names is not None or args.special_token_values is not None: - if args.special_token_names is None: - parser.error( - "If you provide parameter `--special_token_values` you have to provide parameter " - "`--special_token_names`." - ) - if args.special_token_values is None: - parser.error( - "If you provide parameter `--special_token_names` you have to provide parameter " - "`--special_token_values`." - ) - if len(args.special_token_names) != len(args.special_token_values): - parser.error( - f"Parameters `--special_token_names` and `--special_token_values` have to have equal number of values " - f"whereas parameter `--special_token_names` has {len(args.special_token_names)} values and parameter " - f"`--special_token_values` has {len(args.special_token_values)} values." - ) - if len(set(args.special_token_names)) != len(args.special_token_names): - for i in range(len(args.special_token_names) - 1): - if args.special_token_names[i] in args.special_token_names[i + 1 :]: - parser.error( - f"Values of parameter `--special_token_names` has to be unique. Found duplicate value " - f"'{args.special_token_names[i]}'." 
- ) - if args.punct_labels is not None: - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.punct_labels, '--pad_label', '--punct_labels', parser.error - ) - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.capit_labels, '--pad_label', '--capit_labels', parser.error - ) - check_tar_file_prefix(args.tar_file_prefix, parser.error, '--tar_file_prefix') - return args - - -def main() -> None: - args = get_args() - if args.special_token_names is None: - special_tokens = None - else: - special_tokens = dict(zip(args.special_token_names, args.special_token_values)) - - if args.punct_labels is not None: - punct_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.punct_labels) - else: - punct_label_ids = None - - if args.capit_labels is not None: - capit_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.capit_labels) - else: - capit_label_ids = None - - create_tarred_dataset( - args.text, - args.labels, - args.output_dir, - args.max_seq_length, - args.tokens_in_batch, - args.lines_per_dataset_fragment, - args.num_batches_per_tarfile, - args.tokenizer_name, - tokenizer_model=args.tokenizer_model, - vocab_file=args.vocab_file, - merges_file=args.merges_file, - special_tokens=special_tokens, - use_fast_tokenizer=args.use_fast_tokenizer, - pad_label=args.pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - punct_label_vocab_file=args.punct_label_vocab_file, - capit_label_vocab_file=args.capit_label_vocab_file, - tar_file_prefix=args.tar_file_prefix, - n_jobs=args.n_jobs, - audio_file=args.audio_file, - sample_rate=args.sample_rate, - use_audio=args.use_audio, - ) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/token_classification/data/get_libritts_data.py b/examples/nlp/token_classification/data/get_libritts_data.py deleted file mode 100644 index 86a5d01eb9dc..000000000000 --- a/examples/nlp/token_classification/data/get_libritts_data.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This script downloads and unpacks LibriTTS data. And prepares it for punctuation and capitalization lexical audio model. -Data is being downloaded from www.openslr.org and then extracted via tar. -The script gathers text from every *.normalized.txt file inside of archive into single file with text and file with audio filepaths. 
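For orientation, the metadata file written by the tarred-dataset script above (with keys 'num_batches', 'tar_files', and the two label-vocabulary paths, as its description states) can be inspected with a few lines; the file location used here is only an assumption.

    import json
    from pathlib import Path

    metadata_path = Path("tarred_dataset/metadata.json")  # hypothetical output location
    metadata = json.loads(metadata_path.read_text())
    print(metadata["num_batches"])
    # Tar file paths are stored relative to the directory containing the metadata file.
    print([str(metadata_path.parent / name) for name in metadata["tar_files"]])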
-""" -import argparse -import glob -import os -import re -import shutil -import subprocess -import tarfile - -from tqdm import tqdm - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = { - 'train_clean_100': "https://www.openslr.org/resources/60/train-clean-100.tar.gz", - 'train_clean_360': "https://www.openslr.org/resources/60/train-clean-360.tar.gz", - 'train_other_500': "https://www.openslr.org/resources/60/train-other-500.tar.gz", - 'dev_clean': "https://www.openslr.org/resources/60/dev-clean.tar.gz", - 'dev_other': "https://www.openslr.org/resources/60/dev-other.tar.gz", - 'test_clean': "https://www.openslr.org/resources/60/test-clean.tar.gz", - 'test_other': "https://www.openslr.org/resources/60/test-other.tar.gz", -} - - -def __extract_file(filepath, data_dir): - try: - tar = tarfile.open(filepath) - tar.extractall(data_dir) - tar.close() - except Exception: - print(f"Error while extracting {filepath}. Already extracted?") - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - return 1 - else: - logging.info(f'{destination} found. Skipping download') - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Prepare LibriTTS dataset for punctuation capitalization lexical audio model training/evaluating.' - ) - parser.add_argument("--data_sets", default="dev_clean", type=str, help="List of subsets separated by comma") - parser.add_argument("--data_dir", required=True, type=str, help="Path to dir where data will be stored") - parser.add_argument( - "--clean", "-c", action="store_true", help="If set to True will delete all files except produced .txt and .wav" - ) - args = parser.parse_args() - - data_dir = args.data_dir - - if not os.path.exists(data_dir): - os.makedirs(data_dir) - - for subset in args.data_sets.split(','): - logging.info(f'Downloading {subset} subset') - if __maybe_download_file(data_dir + f'/{subset}.tar.gz', subset): - logging.info(f'Extracting {subset} subset') - __extract_file(data_dir + f'/{subset}.tar.gz', data_dir) - - logging.info(f'Processing data') - - splits = set([split.split('_')[0] for split in args.data_sets.split(',')]) - for split in splits: - os.makedirs(f'{data_dir}/audio/{split}', exist_ok=True) - with open(f'{data_dir}/{split}.txt', 'w') as text_data, open( - f'{data_dir}/audio_{split}.txt', 'w' - ) as audio_data: - for file in tqdm(glob.glob(f'{data_dir}/LibriTTS/{split}*/*/*/*.wav'), desc=f'Processing {split}'): - with open(file[:-4] + '.normalized.txt', 'r') as source_file: - lines = source_file.readlines() - text = lines[0] - text = re.sub(r"[^a-zA-Z\d,?!.']", ' ', text) - text = re.sub(' +', ' ', text) - shutil.copy(file.strip(), (f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip()) - text_data.write(text.strip() + "\n") - audio_data.write((f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip() + "\n") - create_text_and_labels(f'{data_dir}/', f'{data_dir}/{split}.txt') - logging.info(f'Processed {split} subset') - - if args.clean: - shutil.rmtree(f'{data_dir}/LibriTTS') - for tar in glob.glob(f'{data_dir}/**.tar.gz'): - os.remove(tar) diff --git 
a/examples/nlp/token_classification/data/get_tatoeba_data.py b/examples/nlp/token_classification/data/get_tatoeba_data.py deleted file mode 100644 index 6a4cd23b249d..000000000000 --- a/examples/nlp/token_classification/data/get_tatoeba_data.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import os -import random -import re -import subprocess - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - else: - logging.info(f'{destination} found. Skipping download') - - -def __process_english_sentences( - in_file: str, out_file: str, percent_to_cut: float = 0, num_to_combine: int = 1, num_samples: int = -1 -): - """ - Extract English sentences from the Tatoeba dataset. - - Expected in_file format - that - contain letters and punctuation marks (,.?). - Chop and combine sentences. - Args: - in_file: local filepath to the tatoeba dataset. - Format: id [TAB] region_name [TAB] sentence, - for example: "1276\teng\tLet's try something.\n" - out_file: local filepath to the clean dataset - percent_to_cut: Percent of sentences to cut in the middle - to get examples of incomplete sentences. 
- This could be useful since ASR output not always - represents a complete sentence - num_to_combine: Number of sentences to combine into - a single example - num_samples: Number of samples in the final dataset - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - in_file = open(in_file, 'r') - out_file = open(out_file, 'w') - lines_to_combine = [] - samples_count = 0 - - for line in in_file: - line = line.split('\t') - # use only English sentences - if line[1] == 'eng': - line = line[2].strip() - if re.match("^[A-Z][A-Za-z.,'?\s]+$", line): # nopep8 - # chop some sentences in the middle - if percent_to_cut > 0: - line = line.split() - if random.random() < percent_to_cut: - line = line[: len(line) // 2] - line = ' '.join(line) - - # combine multiple sentences into a single example - # to make it harder for the model to learn eos punctuation - if len(lines_to_combine) >= num_to_combine: - if samples_count == num_samples: - return - out_file.write(' '.join(lines_to_combine) + '\n') - lines_to_combine = [] - samples_count += 1 - lines_to_combine.append(line) - - if len(lines_to_combine) > 0 and (samples_count < num_samples or num_samples < 0): - out_file.write(' '.join(lines_to_combine) + '\n') - - -def __split_into_train_dev(in_file: str, train_file: str, dev_file: str, percent_dev: float): - """ - Create train and dev split of the dataset. - Args: - in_file: local filepath to the dataset - train_file: local filepath to the train dataset - dev_file: local filepath to the dev dataset - percent_dev: Percent of the sentences in the dev set - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - lines = open(in_file, 'r').readlines() - train_file = open(train_file, 'w') - dev_file = open(dev_file, 'w') - - dev_size = int(len(lines) * percent_dev) - train_file.write(' '.join(lines[:-dev_size])) - dev_file.write(' '.join(lines[-dev_size:])) - - -def __delete_file(file_to_del: str): - """ - Deletes the file - Args: - file_to_del: local filepath to the file to delete - """ - if os.path.exists(file_to_del): - os.remove(file_to_del) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare tatoeba dataset') - parser.add_argument("--data_dir", required=True, type=str) - parser.add_argument("--dataset", default='tatoeba', type=str) - parser.add_argument("--num_samples", default=-1, type=int, help='-1 to use the whole dataset') - parser.add_argument("--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle') - parser.add_argument( - "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example' - ) - parser.add_argument("--percent_dev", default=0.2, type=float, help='Size of the dev set, float') - parser.add_argument("--clean_dir", action='store_true') - args = parser.parse_args() - - if not os.path.exists(args.data_dir): - os.makedirs(args.data_dir) - - if args.dataset != 'tatoeba': - raise ValueError("Unsupported dataset.") - - logging.info(f'Downloading tatoeba dataset') - tatoeba_dataset = os.path.join(args.data_dir, 'sentences.csv') - __maybe_download_file(tatoeba_dataset, args.dataset) - - logging.info(f'Processing English sentences...') - clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt') - __process_english_sentences( - tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples - ) - - train_file = os.path.join(args.data_dir, 'train.txt') - dev_file 
= os.path.join(args.data_dir, 'dev.txt') - - logging.info( - f'Splitting the {args.dataset} dataset into train and dev sets' + ' and creating labels and text files' - ) - __split_into_train_dev(clean_eng_sentences, train_file, dev_file, args.percent_dev) - - logging.info(f'Creating text and label files for training') - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'train.txt')) - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'dev.txt')) - - if args.clean_dir: - logging.info(f'Cleaning up {args.data_dir}') - __delete_file(clean_eng_sentences) - __delete_file(tatoeba_dataset) - __delete_file(train_file) - __delete_file(dev_file) - logging.info(f'Processing of the {args.dataset} is complete') diff --git a/examples/nlp/token_classification/data/import_from_iob_format.py b/examples/nlp/token_classification/data/import_from_iob_format.py deleted file mode 100644 index 4a6f15442b98..000000000000 --- a/examples/nlp/token_classification/data/import_from_iob_format.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -from nemo.utils import logging - - -def __convert_data(in_file: str, out_text_f: str, out_labels_f: str, max_length: int): - """ - Convert data from the IOB format to NeMo accepted format described below. - in_file should be in the IOB format, see example here: - https://www.clips.uantwerpen.be/conll2003/ner/. - - Args: - in_file: input file name - out_text_f: output file with text - out_labels_f: output file with labels - max_length: use -1 to leave the examples' length as is, otherwise long examples will be split into multiple - examples - After the conversion, the dataset is split into 2 files: text.txt - and labels.txt. - Each line of the text.txt file contains text sequences, where words - are separated with spaces. The labels.txt file contains corresponding - labels for each word in text.txt, the labels are separated with spaces. - Each line of the files should follow the format: - [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). 
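A tiny worked example of the conversion described above, using a made-up IOB snippet: the word is the first column and the tag is the last column of every non-empty line.

    iob_lines = ["Harry B-PER", "Potter I-PER", "visited O", "London B-LOC", ""]
    words = [line.split()[0] for line in iob_lines if line.strip()]
    labels = [line.split()[-1] for line in iob_lines if line.strip()]
    print(" ".join(words))   # line for text_<prefix>.txt
    print(" ".join(labels))  # line for labels_<prefix>.txt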
- - """ - in_file = open(in_file, 'r') - - if max_length == -1: - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - for line in in_file: - if line == '\n': - out_text.write(line) - out_labels.write(line) - else: - line = line.split() - out_text.write(line[0] + ' ') - out_labels.write(line[-1] + ' ') - - else: - words = [] - labels = [] - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - lines = in_file.readlines() - for line_id, line in enumerate(lines): - logging.info(f"{line_id} {len(lines)}") - contends = line.strip() - if len(contends) == 0: - assert len(words) == len(labels) - if len(words) > max_length: - # split if the sentence is longer than max_length - while len(words) > max_length: - tmplabel = labels[:max_length] - for iidx in range(len(tmplabel)): - if tmplabel.pop() == 'O': - break - l = ' '.join([label for label in labels[: len(tmplabel) + 1] if len(label) > 0]) - w = ' '.join([word for word in words[: len(tmplabel) + 1] if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = words[len(tmplabel) + 1 :] - labels = labels[len(tmplabel) + 1 :] - - if len(words) == 0: - continue - l = ' '.join([label for label in labels if len(label) > 0]) - w = ' '.join([word for word in words if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = [] - labels = [] - continue - - word = line.strip().split()[0] - label = line.strip().split()[-1] - words.append(word) - labels.append(label) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Convert data from IOB format to the format compatible with \ - nlp/examples/token_classification/scripts/token_classification_train.py and \ - token_classification_evaluate.py' - ) - parser.add_argument("--data_file", required=True, type=str, help='path to a file in IOB format') - parser.add_argument( - "--max_length", - default=-1, - type=int, - help='use -1 to leave the examples\'s length as is, ' - 'otherwise long examples will be split into multiple examples', - ) - args = parser.parse_args() - - data_dir, basename = os.path.split(args.data_file) - prefix = os.path.splitext(basename)[0] - if not os.path.exists(args.data_file): - raise FileNotFoundError(f"{args.data_file} not found") - - logging.info(f'Processing {args.data_file}') - out_text = os.path.join(data_dir, 'text_' + prefix + '.txt') - out_labels = os.path.join(data_dir, 'labels_' + prefix + '.txt') - - __convert_data(args.data_file, out_text, out_labels, args.max_length) - logging.info(f'Processing of the {args.data_file} is complete') diff --git a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py deleted file mode 100644 index 78a0763d3b54..000000000000 --- a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -The script converts raw text to the NeMo format for punctuation and capitalization task. - -Raw Data Format ---------------- - -The Punctuation and Capitalization model can work with any text dataset, although it is recommended to balance the data, especially for the punctuation task. -Before pre-processing the data to the format expected by the model, the data should be split into train.txt and dev.txt (and optionally test.txt). -Each line in the **train.txt/dev.txt/test.txt** should represent one or more full and/or truncated sentences. - -Example of the train.txt/dev.txt file: - When is the next flight to New York? - The next flight is ... - .... - - -The `source_data_dir` structure should look like this: - . - |--sourced_data_dir - |-- dev.txt - |-- train.txt - - - -NeMo Data Format for training the model ---------------------------------------- - -The punctuation and capitalization model expects the data in the following format: - -The training and evaluation data is divided into 2 files: text.txt and labels.txt. \ -Each line of the **text.txt** file contains text sequences, where words are separated with spaces, i.e. - -[WORD] [SPACE] [WORD] [SPACE] [WORD], for example: - when is the next flight to new york - the next flight is ... - ... - -The **labels.txt** file contains corresponding labels for each word in text.txt, the labels are separated with spaces. \ -Each label in labels.txt file consists of 2 symbols: - -* the first symbol of the label indicates what punctuation mark should follow the word (where O means no punctuation needed); -* the second symbol determines if a word needs to be capitalized or not (where U indicates that the word should be upper-cased, and O - no capitalization needed.) - -By default, the following punctuation marks are considered: commas, periods, and question marks; the rest punctuation marks were removed from the data. -This can be changed by introducing new labels in the labels.txt files - -Each line of the labels.txt should follow the format: [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). \ -For example, labels for the above text.txt file should be: - - OU OO OO OO OO OO OU ?U - OU OO OO OO ... - ... - -The complete list of all possible labels for this task used in this tutorial is: OO, ,O, .O, ?O, OU, ,U, .U, ?U. 
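The two-character label scheme above can be checked with a short sketch (illustrative only, not the create_text_and_labels implementation); it reproduces the labels shown for the sample sentence.

    def make_labels(sentence, marks=",.?"):
        words, labels = [], []
        for word in sentence.split():
            punct = word[-1] if word[-1] in marks else "O"   # punctuation following the word
            stripped = word.rstrip(marks)
            capit = "U" if stripped[:1].isupper() else "O"   # capitalization of the word
            words.append(stripped.lower())
            labels.append(punct + capit)
        return " ".join(words), " ".join(labels)

    print(make_labels("When is the next flight to New York?"))
    # -> ('when is the next flight to new york', 'OU OO OO OO OO OO OU ?U')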
- -Converting Raw data to NeMo format ----------------------------------- - -To pre-process the raw text data, stored under :code:`sourced_data_dir` (see the :ref:`raw_data_format_punct` -section), run the following command: - - python examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py \ - -s \ - -o - -""" - -import argparse -import os - -from get_tatoeba_data import create_text_and_labels - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare data for punctuation and capitalization tasks') - parser.add_argument("-s", "--source_file", required=True, type=str, help="Path to the source file") - parser.add_argument("-o", "--output_dir", required=True, type=str, help="Path to the output directory") - parser.add_argument( - "-p", - "--marks", - required=False, - type=str, - help="Punctuation marks to consider for dataset", - default=[",", ".", "?"], - nargs="+", - ) - args = parser.parse_args() - - if not os.path.exists(args.source_file): - raise ValueError(f'{args.source_file} was not found') - - os.makedirs(args.output_dir, exist_ok=True) - create_text_and_labels(args.output_dir, args.source_file, "".join(args.marks)) - - print(f'Processing of the {args.source_file} is complete') diff --git a/examples/nlp/token_classification/punctuate_capitalize_infer.py b/examples/nlp/token_classification/punctuate_capitalize_infer.py deleted file mode 100644 index 8fdb3ab5a1ed..000000000000 --- a/examples/nlp/token_classification/punctuate_capitalize_infer.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -from pathlib import Path -from typing import Dict, List, Union - -import torch.cuda - -from nemo.collections.nlp.models import PunctuationCapitalizationLexicalAudioModel, PunctuationCapitalizationModel - - -""" -This script is for restoring punctuation and capitalization. - -Usage example: - -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest - -Usage example for lexical audio model: -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest \ - --use_audio - - - is a path to NeMo ASR manifest. Usually it is an output of - NeMo/examples/asr/transcribe_speech.py but can be a manifest with 'text' key. Alternatively you can use - --input_text parameter for passing text for inference. - is a path to NeMo ASR manifest into which script output will be written. Alternatively - you can use parameter --output_text. - -For more details on this script usage look in argparse help. -""" - - -def get_args() -> argparse.Namespace: - default_model_parameter = "pretrained_name" - default_model = "punctuation_en_bert" - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="The script is for restoring punctuation and capitalization in text or text and audio. To use text and audio use '--use_audio'. 
Long strings are split into " - "segments of length `--max_seq_length`. `--max_seq_length` is the length which includes [CLS] and [SEP] " - "tokens. If `--use_audio` is set, samples with texts longer than `--max_seq_length` will be ignored. Parameter `--step` controls segments overlapping. `--step` is a distance between beginnings of " - "consequent segments. Model outputs for tokens near the borders of tensors are less accurate and can be " - "discarded before final predictions computation. Parameter `--margin` is number of discarded outputs near " - "segments borders. Probabilities of tokens in overlapping parts of segments multiplied before selecting the " - "best prediction. Default values of parameters `--max_seq_length`, `--step`, and `--margin` are optimal for " - "IWSLT 2019 test dataset.", - ) - parser.add_argument( - '--use_audio', - required=False, - action="store_true", - help="If set `PunctuationCapitalizationLexicalAudioModel` will be used for inference", - ) - input_ = parser.add_mutually_exclusive_group(required=True) - input_.add_argument( - "--input_manifest", - "-m", - type=Path, - help="Path to the file with NeMo manifest which needs punctuation and capitalization. If the first element " - "of manifest contains key 'pred_text', 'pred_text' values are passed for tokenization. Otherwise 'text' " - "values are passed for punctuation and capitalization. Exactly one parameter of `--input_manifest` and " - "`--input_text` should be provided.", - ) - input_.add_argument( - "--input_text", - "-t", - type=Path, - help="Path to file with text which needs punctuation and capitalization. Exactly one parameter of " - "`--input_manifest` and `--input_text` should be provided.", - ) - parser.add_argument( - '--audio_file', - required=False, - type=Path, - help="Path to file with paths to audio. One path per row. Required if '--input_text' provided. Else 'audio_filepath' from manifest will be used.", - ) - output = parser.add_mutually_exclusive_group(required=True) - output.add_argument( - "--output_manifest", - "-M", - type=Path, - help="Path to output NeMo manifest. Text with restored punctuation and capitalization will be saved in " - "'pred_text' elements if 'pred_text' key is present in the input manifest. Otherwise text with restored " - "punctuation and capitalization will be saved in 'text' elements. Exactly one parameter of `--output_manifest` " - "and `--output_text` should be provided.", - ) - output.add_argument( - "--output_text", - "-T", - type=Path, - help="Path to file with text with restored punctuation and capitalization. Exactly one parameter of " - "`--output_manifest` and `--output_text` should be provided.", - ) - model = parser.add_mutually_exclusive_group(required=False) - model.add_argument( - "--pretrained_name", - "-p", - help=f"The name of NGC pretrained model. No more than one of parameters `--pretrained_name`, `--model_path`" - f"should be provided. If neither of parameters `--pretrained_name` and `--model_path` are provided, then the " - f"script is run with `--{default_model_parameter}={default_model}`.", - choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()] - + [m.pretrained_model_name for m in PunctuationCapitalizationLexicalAudioModel.list_available_models()], - ) - model.add_argument( - "--model_path", - "-P", - type=Path, - help=f"Path to .nemo checkpoint of punctuation and capitalization model. No more than one of parameters " - f"`--pretrained_name` and `--model_path` should be provided. 
If neither of parameters `--pretrained_name` and " - f"`--model_path` are provided, then the script is run with `--{default_model_parameter}={default_model}`.", - ) - parser.add_argument( - "--max_seq_length", - "-L", - type=int, - default=64, - help="Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] tokens.", - ) - parser.add_argument( - "--step", - "-s", - type=int, - default=8, - help="Relative shift of consequent segments into which long queries are split. Long queries are split into " - "segments which can overlap. Parameter `step` controls such overlapping. Imagine that queries are " - "tokenized into characters, `max_seq_length=5`, and `step=2`. In such a case query 'hello' is tokenized " - "into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.", - ) - parser.add_argument( - "--margin", - "-g", - type=int, - default=16, - help="A number of subtokens in the beginning and the end of segments which output probabilities are not used " - "for prediction computation. The first segment does not have left margin and the last segment does not have " - "right margin. For example, if input sequence is tokenized into characters, `max_seq_length=5`, `step=1`, " - "and `margin=1`, then query 'hello' will be tokenized into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], " - "['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments are passed to the " - "model. Before final predictions computation, margins are removed. In the next list, subtokens which logits " - "are not used for final predictions computation are marked with asterisk: `[['[CLS]'*, 'h', 'e', 'l'*, " - "'[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]`.", - ) - parser.add_argument( - "--batch_size", "-b", type=int, default=128, help="Number of segments which are processed simultaneously.", - ) - parser.add_argument( - "--save_labels_instead_of_text", - "-B", - action="store_true", - help="If this option is set, then punctuation and capitalization labels are saved instead text with restored " - "punctuation and capitalization. Labels are saved in format described here " - "https://docs.nvidia.com/deeplearning/nemo/" - "user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format", - ) - parser.add_argument( - "--device", - "-d", - choices=['cpu', 'cuda'], - help="Which device to use. If device is not set and CUDA is available, then GPU will be used. 
If device is " - "not set and CUDA is not available, then CPU is used.", - ) - parser.add_argument( - "--sample_rate", - type=int, - default=16000, - help="Target sample rate for audios if `--use_audio` was passed", - required=False, - ) - args = parser.parse_args() - if args.input_manifest is None and args.output_manifest is not None: - parser.error("--output_manifest requires --input_manifest") - if args.use_audio and (args.input_manifest is None and args.audio_file is None): - parser.error("--use_audio and --input_text require --audio_file") - if args.pretrained_name is None and args.model_path is None: - setattr(args, default_model_parameter, default_model) - for name in ["input_manifest", "input_text", "output_manifest", "output_text", "model_path", "audio_file"]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - return args - - -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result - - -def main() -> None: - args = get_args() - if args.pretrained_name is None: - model = ( - PunctuationCapitalizationModel.restore_from(args.model_path) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - else: - model = ( - PunctuationCapitalizationModel.from_pretrained(args.pretrained_name) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - if args.device is None: - if torch.cuda.is_available(): - model = model.cuda() - else: - model = model.cpu() - else: - model = model.to(args.device) - if args.input_manifest is None: - texts = [] - audios = [] - with args.input_text.open() as f: - for line in f: - texts.append(line.strip()) - if args.use_audio: - with args.audio_file.open() as f: - for line in f: - audios.append(line.strip()) - else: - manifest = load_manifest(args.input_manifest) - text_key = "pred_text" if "pred_text" in manifest[0] else "text" - texts = [] - audios = [] - for item in manifest: - texts.append(item[text_key]) - if args.use_audio: - audios.append(item["audio_filepath"]) - if args.use_audio: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - audio_queries=audios, - target_sr=args.sample_rate, - ) - else: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - ) - if args.output_manifest is None: - args.output_text.parent.mkdir(exist_ok=True, parents=True) - with args.output_text.open('w') as f: - for t in processed_texts: - f.write(t + '\n') - else: - args.output_manifest.parent.mkdir(exist_ok=True, parents=True) - with args.output_manifest.open('w') as f: - for item, t in zip(manifest, processed_texts): - item[text_key] = t - f.write(json.dumps(item) + '\n') - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py deleted file mode 100644 index 508e434bb598..000000000000 --- 
a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationLexicalAudioConfig, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_lexical_audio_model import ( - PunctuationCapitalizationLexicalAudioModel, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model with lexical and acoustic features. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, audio encoder, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
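The docstring sections that follow pass configuration overrides as dotted command-line arguments. As a minimal OmegaConf sketch (keys and file names here are made up for illustration), this is roughly what such overrides amount to when Hydra merges them into the YAML config:

    from omegaconf import OmegaConf

    base = OmegaConf.create({"model": {"train_ds": {"text_file": None}}, "trainer": {"max_epochs": 1}})
    # Dotted overrides like `model.train_ds.text_file=...` update nested keys.
    overrides = OmegaConf.from_dotlist(["model.train_ds.text_file=train_text.txt", "trainer.max_epochs=3"])
    cfg = OmegaConf.merge(base, overrides)
    print(OmegaConf.to_yaml(cfg))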
- -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= - -To use BERT-like pretrained P&C models' weights to initialize lexical encoder, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.restore_lexical_encoder_from= - - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - pretrained_model= \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model== \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_lexical_audio_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationLexicalAudioConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to be `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationLexicalAudioModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationLexicalAudioModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationLexicalAudioModel.get_available_model_names(): - model = PunctuationCapitalizationLexicalAudioModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from ' - f'{PunctuationCapitalizationLexicalAudioModel.list_available_models()}' - ) - 
model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py deleted file mode 100644 index b16e1ecd0bdc..000000000000 --- a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import PunctuationCapitalizationModel -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationConfig, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
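The `text_file` and `labels_file` arguments referenced in the training commands that follow use the NeMo punctuation-and-capitalization label format (see the data-format documentation linked in the inference script above for the authoritative description). A small, hedged illustration of how one text/label pair decodes, assuming the first label character is the punctuation mark to insert after the word ('O' for none) and the second is the capitalization flag ('U' = capitalize):

    text_line = "when is the next flight to new york"
    label_line = "OU OO OO OO OO OO OU ?U"
    for word, label in zip(text_line.split(), label_line.split()):
        punct, capit = label[0], label[1]
        print(f"{word}: punctuation after = {punct!r}, capitalize = {capit == 'U'}")
    # Decoded target sentence: "When is the next flight to New York?"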
- -Additional default parameters could be found in PunctuationCapitalizationDataConfigBase from -/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py, -use `+` to modify their values via command line, e.g.: `+model.train_ds.num_workers=2` - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - -To use one of the pretrained versions of the model and finetune it, run: - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - - pretrained_model - pretrained PunctuationCapitalization model from list_available_models() or - path to a .nemo file, for example: punctuation_en_bert or model.nemo - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model=punctuation_en_bert \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(): - model = 
PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Config parameter `pretrained_model` should contain a path to the pre-trained .nemo file or a model ' - f'name from ' - f'{[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()]}. ' - f'Provided `pretrained_model="{cfg.pretrained_model}"` is neither a valid path, nor a valid model ' - f'name.' - ) - model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/token_classification_evaluate.py b/examples/nlp/token_classification/token_classification_evaluate.py deleted file mode 100644 index 764aa90c8593..000000000000 --- a/examples/nlp/token_classification/token_classification_evaluate.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script shows how to perform evaluation and runs inference of a few examples. - -More details on Token Classification model could be found in tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Setting the configs *** - -This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
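The evaluation script below runs both `trainer.test(model)` and `model.evaluate_from_file(...)`. As a standalone, hedged usage sketch of the latter (file names and paths are placeholders; the keyword arguments mirror the call made later in this script):

    import os

    from nemo.collections.nlp.models import TokenClassificationModel

    model = TokenClassificationModel.from_pretrained("ner_en_bert")
    data_dir = "/path/to/ner_data"  # placeholder directory with text/labels files
    model.evaluate_from_file(
        text_file=os.path.join(data_dir, "text_dev.txt"),      # assumed file name
        labels_file=os.path.join(data_dir, "labels_dev.txt"),   # assumed file name
        output_dir="./eval_results",
        add_confusion_matrix=True,
        normalize_confusion_matrix=True,
    )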
- -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Evaluation *** - -The script runs two types of evaluation: - * model.test() - this eval will use the config setting for evaluation such as model.dataset.max_seq_length - * model.evaluate_from_file(): - * disregards model.dataset.max_seq_length and evaluates all the tokens, BERT max seq length - 512 tokens after tokenization - * creates confusion matrix - * saves predictions and labels (if provided) - -To run the script: - - python token_classification_evaluate.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or your_model.nemo - -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - - if not hasattr(cfg.model, 'test_ds'): - raise ValueError(f'model.test_ds was not found in the config, skipping evaluation') - - trainer = pl.Trainer( - devices=1, - precision=cfg.trainer.precision, - logger=False, - enable_checkpointing=False, - accelerator=cfg.trainer.accelerator, - ) - exp_dir = exp_manager(trainer, cfg.exp_manager) - - if not cfg.pretrained_model: - raise ValueError( - 'To run evaluation and inference script a pre-trained model or .nemo file must be provided.' - f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"' - ) - - if os.path.exists(cfg.pretrained_model): - model = TokenClassificationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir is None: - logging.error( - 'No dataset directory provided. Skipping evaluation. ' - 'To run evaluation on a file, specify path to the directory that contains test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.' - ) - elif not os.path.exists(data_dir): - logging.error(f'{data_dir} is not found, skipping evaluation on the test set.') - else: - model.update_data_dir(data_dir=data_dir) - model._cfg.dataset = cfg.model.dataset - - if not hasattr(cfg.model, 'test_ds'): - logging.error(f'model.test_ds was not found in the config, skipping evaluation') - elif model.prepare_test(trainer): - model.setup_test_data(cfg.model.test_ds) - trainer.test(model) - - model.evaluate_from_file( - text_file=os.path.join(data_dir, cfg.model.test_ds.text_file), - labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file), - output_dir=exp_dir, - add_confusion_matrix=True, - normalize_confusion_matrix=True, - ) - else: - logging.error('Skipping the evaluation. 
The trainer is not setup properly.') - - # run an inference on a few examples - queries = ['we bought four shirts from the nvidia gear store in santa clara.', 'Nvidia is a company.'] - results = model.add_predictions(queries, output_file='predictions.txt') - - for query, result in zip(queries, results): - logging.info(f'Query : {query}') - logging.info(f'Result: {result.strip()}\n') - - logging.info(f'Results are saved at {exp_dir}') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/token_classification_train.py b/examples/nlp/token_classification/token_classification_train.py deleted file mode 100644 index 536327aff6da..000000000000 --- a/examples/nlp/token_classification/token_classification_train.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This scripts shows how to train a Token Classification model. - -The Token Classification model supports Named Entity Recognition task and other token level classification tasks, -as long as the data follows the format specified below. - -More details on how to use this script could be found in -tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Data Format *** -Token Classification Model requires the data to be split into 2 files: text.txt and labels.txt. -Each line of the text.txt file contains text sequences, where words are separated with spaces, i.e.: -[WORD] [SPACE] [WORD] [SPACE] [WORD]. -The labels.txt file contains corresponding labels for each word in text.txt, the labels are separated with spaces, i.e.: -[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]. - -Example of a text.txt file: -Jennifer is from New York City . -She likes ... -... - -Corresponding labels.txt file: -B-PER O O B-LOC I-LOC I-LOC O -O O ... -... - -*** Preparing the dataset *** - -To convert an IOB format data to the format required for training, run -examples/nlp/token_classification/data/import_from_iob_format.py on your train and dev files, as follows: - -python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. 
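The IOB conversion mentioned in the data-preparation notes above can be pictured with the following simplified sketch. It assumes a plain CoNLL-style layout (one `token label` pair per line, blank lines between sentences); the actual import_from_iob_format.py script may handle additional cases:

    def iob_to_text_and_labels(iob_path, text_path, labels_path):
        # Convert CoNLL-style IOB input into the text.txt / labels.txt pair
        # described in the *** Data Format *** section above.
        words, labels = [], []
        with open(iob_path) as fin, open(text_path, "w") as ftext, open(labels_path, "w") as flabels:
            for line in fin:
                line = line.strip()
                if not line:  # sentence boundary
                    if words:
                        ftext.write(" ".join(words) + "\n")
                        flabels.write(" ".join(labels) + "\n")
                        words, labels = [], []
                    continue
                parts = line.split()
                words.append(parts[0])    # token is the first column
                labels.append(parts[-1])  # label is the last column
            if words:  # flush the final sentence
                ftext.write(" ".join(words) + "\n")
                flabels.write(" ".join(labels) + "\n")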
-This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Training *** - -To train TokenClassification model from scratch with the default config file, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - trainer.max_epochs= \ - trainer.devices=[] - -To use one of the pretrained versions of the model specify a `pretrained_model` arg with either -TokenClassification model from list_available_models() or path to a .nemo file, for example: -ner_en_bert or model.nemo, run: - - python token_classification_train.py pretrained_model=ner_en_bert - -To use one of the pretrained versions of the model and fine-tune it, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or model.nemo - -For more ways of restoring a pre-trained model, see tutorials/00_NeMo_Primer.ipynb -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - try: - strategy = NLPDDPStrategy(find_unused_parameters=True) - except (ImportError, ModuleNotFoundError): - strategy = 'auto' - - trainer = pl.Trainer(strategy=strategy, **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = TokenClassificationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - # TODO: can we drop strict=False? 
- model = TokenClassificationModel.restore_from(cfg.pretrained_model, trainer=trainer, strict=False) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir: - if not os.path.exists(data_dir): - raise ValueError(f'{data_dir} is not found at') - - # we can also do finetuning of the pretrained model but it will require - # setup the data dir to get class weights statistics - model.update_data_dir(data_dir=data_dir) - # finally, setup train and validation Pytorch DataLoaders - model.setup_training_data() - model.setup_validation_data() - # then we're setting up loss, use model.dataset.class_balancing, - # if you want to add class weights to the CrossEntropyLoss - model.setup_loss(class_balancing=cfg.model.dataset.class_balancing) - logging.info(f'Using config file of the pretrained model') - else: - raise ValueError( - 'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file \ - with "model.dataset.data_dir" argument' - ) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml deleted file mode 100644 index 64fd5f8542f3..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Config file for Zero Shot Intent Recognition (BERT model trained NLI) -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 1 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - dataset: - data_dir: ??? # /path/to/data - sentence_1_column: 8 # index of the column containing the premise or sentence 1 - sentence_2_column: 9 # index of the column containing the hypothesis or sentence 2 - label_column: -1 # index of the column containing labels. Labels should be "entailment", "contradiction", and "neutral". - class_balancing: null # null or 'weighted_loss'. 
'weighted_loss' enables the weighted class balancing of the loss, may be used for handling unbalanced classes - use_cache: true # uses a cache to store the processed dataset, you may use it for large datasets for speed up - num_classes: 3 - max_seq_length: 128 - do_lower_case: true # true for uncased models, false for cased models, will be set automatically if pre-trained tokenizer model is used - - train_ds: - file_name: train.tsv - batch_size: 64 - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - file_name: dev_matched.tsv - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - file_name: null - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - classifier_head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: "ZeroShotIntentRecognition" # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -pretrained_model: # pretrained ZeroShotIntent model to be used for inference (.nemo file) \ No newline at end of file diff --git a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py deleted file mode 100644 index eca8f1ef87c6..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
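The configuration above trains an NLI classifier on MNLI-style TSV data (the `sentence_1_column`, `sentence_2_column`, and `label_column` entries). At inference time, zero-shot intent recognition typically recasts each candidate label as an entailment hypothesis about the query; a schematic, hedged sketch of that framing (the exact hypothesis template used by ZeroShotIntentModel may differ):

    def build_nli_pairs(query, candidate_labels, template="This example is {}."):
        # One (premise, hypothesis) pair per candidate label; the NLI model's
        # entailment probability becomes the score for that label.
        return [(query, template.format(label)) for label in candidate_labels]

    pairs = build_nli_pairs(
        "Turn off the lights in the living room",
        ["Food order", "Change lighting", "Play music"],
    )
    for premise, hypothesis in pairs:
        print(f"premise: {premise!r} -> hypothesis: {hypothesis!r}")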
- -import json -import os - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - - # initialize the model using the config file - if cfg.pretrained_model and os.path.exists(cfg.pretrained_model): - model = ZeroShotIntentModel.restore_from(cfg.pretrained_model, strict=False) - else: - raise ValueError('Provide path to the pre-trained .nemo checkpoint') - - # predicting an intent of a query - queries = [ - "I'd like a veggie burger and fries", - "Turn off the lights in the living room", - ] - - candidate_labels = ['Food order', 'Play music', 'Request for directions', 'Change lighting', 'Calendar query'] - - predictions = model.predict(queries, candidate_labels, batch_size=4, multi_label=True) - - logging.info('The prediction results of some sample queries with the trained model:') - for query in predictions: - logging.info(json.dumps(query, indent=4)) - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py deleted file mode 100644 index 4dbbf01c935e..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = ZeroShotIntentModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/nemo/collections/nlp/data/__init__.py b/nemo/collections/nlp/data/__init__.py index 7c1b59d3868c..ffc0bdabe0e7 100644 --- a/nemo/collections/nlp/data/__init__.py +++ b/nemo/collections/nlp/data/__init__.py @@ -13,10 +13,6 @@ # limitations under the License. 
from nemo.collections.nlp.data.data_utils import * # noqa: F401 -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset # noqa: F401 -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( # noqa: F401 - BertInformationRetrievalDataset, -) from nemo.collections.nlp.data.language_modeling.l2r_lm_dataset import ( # noqa: F401 L2RLanguageModelingDataset, TarredL2RLanguageModelingDataset, @@ -33,11 +29,3 @@ TarredTranslationDataset, TranslationDataset, ) -from nemo.collections.nlp.data.token_classification.token_classification_dataset import ( # noqa: F401 - BertTokenClassificationDataset, - BertTokenClassificationInferDataset, -) -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( # noqa: F401 - ZeroShotIntentDataset, - ZeroShotIntentInferenceDataset, -) diff --git a/nemo/collections/nlp/data/entity_linking/__init__.py b/nemo/collections/nlp/data/entity_linking/__init__.py deleted file mode 100644 index 659718d71b82..000000000000 --- a/nemo/collections/nlp/data/entity_linking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset diff --git a/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py b/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py deleted file mode 100644 index 3b1d97a354f0..000000000000 --- a/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
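The EntityLinkingDataset class that follows records the byte offset of every line in the data file so that `__getitem__` can seek straight to an example instead of loading the whole file into memory. A minimal, self-contained sketch of that idea (not the exact NeMo helpers such as find_newlines or load_data_indices):

    import array

    def build_line_offsets(path):
        # Offset of line i is the cumulative byte length of all preceding lines.
        offsets = array.array("I", [0])
        with open(path, "rb") as f:
            for line in f:
                offsets.append(offsets[-1] + len(line))
        return offsets[:-1]  # drop the offset pointing past the last line

    def read_line_at(path, offsets, idx):
        with open(path, "rb") as f:
            f.seek(offsets[idx])
            return f.readline().decode("utf-8").rstrip("\n")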
- -import array -import pickle as pkl -from typing import Optional - -import torch - -from nemo.collections.nlp.data.data_utils.data_preprocessing import find_newlines, load_data_indices -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['EntityLinkingDataset'] - - -class EntityLinkingDataset(Dataset): - """ - Parent class for entity linking encoder training and index - datasets - - Args: - tokenizer (obj): huggingface tokenizer, - data_file (str): path to tab separated column file where data - pairs appear in the format - concept_ID\tconcept_synonym1\tconcept_synonym2\n - newline_idx_file (str): path to pickle file containing location - of data_file newline characters - max_seq_length (int): maximum length of a concept in tokens - is_index_data (bool): Whether dataset will be used for building - a nearest neighbors index - """ - - def __init__( - self, - tokenizer: object, - data_file: str, - newline_idx_file: Optional[str] = None, - max_seq_length: Optional[int] = 512, - is_index_data: bool = False, - ): - - self.tokenizer = tokenizer - - # Try to load the pair indices file if it already exists - newline_indices, newline_idx_file, _ = load_data_indices(newline_idx_file, data_file, "newline_indices") - - # If the pair indices file doesn't exist, generate and store it - if newline_indices is None: - logging.info("Getting datafile newline indices") - - with open(data_file, "rb") as f: - contents = f.read() - newline_indices = find_newlines(contents) - newline_indices = array.array("I", newline_indices) - - # Store data file indices to avoid generating them again - with open(newline_idx_file, "wb") as f: - pkl.dump(newline_indices, f) - - self.newline_indices = newline_indices - self.data_file = data_file - self.num_lines = len(newline_indices) - self.max_seq_length = max_seq_length - self.is_index_data = is_index_data - - logging.info(f"Loaded dataset with {self.num_lines} examples") - - def __len__(self): - return self.num_lines - - def __getitem__(self, idx): - - concept_offset = self.newline_indices[idx] - - with open(self.data_file, "r", encoding='utf-8-sig') as f: - # Find data pair within datafile using byte offset - f.seek(concept_offset) - concept = f.readline()[:-1] - concept = concept.strip().split("\t") - - if self.is_index_data: - concept_id, concept = concept - return (int(concept_id), concept) - - else: - concept_id, concept1, concept2 = concept - return (int(concept_id), concept1, concept2) - - def _collate_fn(self, batch): - """collate batch of input_ids, segment_ids, input_mask, and label - - Args: - batch: A list of tuples of format (concept_ID, concept_synonym1, concept_synonym2).
- """ - if self.is_index_data: - concept_ids, concepts = zip(*batch) - concept_ids = list(concept_ids) - concepts = list(concepts) - - else: - concept_ids, concepts1, concepts2 = zip(*batch) - concept_ids = list(concept_ids) - concept_ids.extend(concept_ids) # Need to double label list to match each concept - concepts = list(concepts1) - concepts.extend(concepts2) - - batch = self.tokenizer( - concepts, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.max_seq_length, - return_token_type_ids=True, - return_attention_mask=True, - return_length=True, - ) - - return ( - torch.LongTensor(batch["input_ids"]), - torch.LongTensor(batch["token_type_ids"]), - torch.LongTensor(batch["attention_mask"]), - torch.LongTensor(concept_ids), - ) diff --git a/nemo/collections/nlp/data/glue_benchmark/__init__.py b/nemo/collections/nlp/data/glue_benchmark/__init__.py deleted file mode 100644 index 753411382bc1..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset diff --git a/nemo/collections/nlp/data/glue_benchmark/data_processors.py b/nemo/collections/nlp/data/glue_benchmark/data_processors.py deleted file mode 100644 index 3d907f24eff8..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/data_processors.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
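To make the column indexing in the GLUE processors below concrete, here is a small fabricated example of how one MRPC TSV row becomes the text/label fields of an InputExample and a T5 prompted query (the indices follow MrpcProcessor._create_examples; the row content is invented):

    # MRPC rows: label, sentence-1 id, sentence-2 id, sentence 1, sentence 2.
    row = ["1", "id_1", "id_2", "He said hello .", "Hello was said by him ."]
    text_a, text_b, label = row[3], row[4], row[0]
    prompt = f"mrpc sentence1: {text_a} sentence2: {text_b}"
    target = "equivalent" if label == "1" else "not equivalent"
    print(prompt, "->", target)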
- -import os - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor -from nemo.utils import logging - -__all__ = [ - 'ColaProcessor', - 'MnliProcessor', - 'MnliMismatchedProcessor', - 'MrpcProcessor', - 'Sst2Processor', - 'StsbProcessor', - 'QqpProcessor', - 'QnliProcessor', - 'RteProcessor', - 'WnliProcessor', - 'XNLIProcessor', -] - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mrpc sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return "equivalent" if label == "1" else "not equivalent" - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class XNLIProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[6] - text_b = line[7] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli 
hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"cola sentence: {text_a}" - - def label2string(self, label): - return "acceptable" if label == "1" else "not acceptable" - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"sst2 sentence: {text_a}" - - def label2string(self, label): - return "positive" if label == "1" else "negative" - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = 
"%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"stsb sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return '%.1f' % float(label) - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qqp question1: {text_a} question2: {text_b}" - - def label2string(self, label): - return "duplicate" if label == "1" else "not_duplicate" - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qnli question: {text_a} sentence: {text_b}" - - def label2string(self, label): - return label - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, 
line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"rte sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return label - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - def label2string(self, label): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - -class InputExample(object): - """A single training/test example for simple sequence classification. - - Args: - guid: Unique id for the example. - text_a: The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: The untokenized text of the second - sequence. Must only be specified for sequence pair tasks. - label: The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - - def __init__(self, guid: int, text_a: str, text_b: str = None, label: str = None): - """Constructs an InputExample.""" - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - def __repr__(self): - return ( - f"InputExample(guid='{self.guid}', text_a='{self.text_a}', text_b='{self.text_b}', label='{self.label}')" - ) diff --git a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py b/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py deleted file mode 100644 index ef7845895a72..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
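A quick sketch of how the processors above feed the T5 text-to-text pipeline (the sentences and usage are invented for illustration; the classes come from the data_processors module deleted in this diff):

from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample, QqpProcessor, RteProcessor

rte = RteProcessor()
example = InputExample(guid="dev-1", text_a="A man is playing a guitar.", text_b="A person is making music.", label="entailment")

# Prompted encoder input, exactly as produced by get_t5_prompted_query above:
prompt = rte.get_t5_prompted_query(example.text_a, example.text_b)
# -> "rte sentence1: A man is playing a guitar. sentence2: A person is making music."

# Decoder target strings come from label2string; RTE labels pass through unchanged,
# while QQP maps "1" -> "duplicate" and anything else -> "not_duplicate".
target = rte.label2string(example.label)        # "entailment"
qqp_target = QqpProcessor().label2string("1")   # "duplicate"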
- -# Some code of this file was adapted from the HuggingFace library available at -# https://github.com/huggingface/transformers - -import os -import pickle -from typing import Dict, List, Optional, Union - -import numpy as np -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import ( - ColaProcessor, - MnliMismatchedProcessor, - MnliProcessor, - MrpcProcessor, - QnliProcessor, - QqpProcessor, - RteProcessor, - Sst2Processor, - StsbProcessor, - WnliProcessor, - XNLIProcessor, -) -from nemo.core.classes import Dataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType, RegressionValuesType -from nemo.utils import logging - -__all__ = ['GLUEDataset', 'TextToTextGLUEDataset', 'TextToTextXNLIDataset'] - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, - "xnli": XNLIProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", - "xnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} - - -class GLUEDataset(Dataset): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - "labels": NeuralType( - tuple('B'), RegressionValuesType() if self.task_name == 'sts-b' else CategoricalValuesType() - ), - } - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: str, - use_cache: bool = True, - compute_features: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - """ - original_file_name = file_name - logging.info(f'Processing {file_name}') - data_dir, file_name = os.path.split(file_name) - file_name = file_name[:-4] - self.tokenizer = tokenizer - evaluate = False if 'train' in file_name else True - - if task_name not in processors: - raise ValueError(f'{task_name} not supported. Choose from {processors.keys()}') - - if task_name == 'mnli' and 'dev_mismatched' in file_name: - self.task_name = 'mnli-mm' - else: - self.task_name = task_name - - processor = processors[self.task_name]() - output_mode = output_modes[self.task_name] - self.label_list = processor.get_labels() - - # TODO: use a different variable to decide whether to trust the user provided filename. This is a temporary workaround for T5 GLUE and XNLI. 
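# For example, task_name "mnli" with a "dev_mismatched" file resolves above to self.task_name == "mnli-mm",
# so processor is MnliMismatchedProcessor(), output_mode is "classification", and self.label_list comes from
# that processor's get_labels().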
- if not compute_features: - if not os.path.exists(original_file_name): - raise ValueError(f"Could not find file: {original_file_name}") - self.examples = processor.get_examples(original_file_name) - else: - self.examples = ( - processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - ) - processor_name = type(processor).__name__ - vocab_size = getattr(tokenizer, "vocab_size", 0) - if compute_features: - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}_{}".format( - processor_name, file_name, tokenizer.name, str(max_seq_length), str(vocab_size) - ), - ) - - if use_cache and os.path.exists(cached_features_file): - logging.info(f"loading from {cached_features_file}") - with open(cached_features_file, "rb") as reader: - self.features = pickle.load(reader) - else: - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - - self.features = self.convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if master_device: - logging.info(f'Saving features into {cached_features_file}') - with open(cached_features_file, "wb") as writer: - pickle.dump(self.features, writer) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.longlong), - np.array(feature.label_id), - ) - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . [SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessary since the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. 
- - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
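# Illustrative padding layout (values invented): with max_seq_length = 8, six real tokens,
# pad_on_left = False and mask_padding_with_zero = True, the padding below produces
#   input_ids   -> [id_1, id_2, id_3, id_4, id_5, id_6, pad_id, pad_id]
#   input_mask  -> [1, 1, 1, 1, 1, 1, 0, 0]
#   segment_ids -> the original segment ids followed by two pad_token_segment_id entries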
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - def _truncate_seq_pair(self, tokens_a: str, tokens_b: str, max_length: int): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class TextToTextGLUEDataset(GLUEDataset): - """GLUE Dataset in a text-to-text format.""" - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - pad_to_max_length: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - prefix_override: if you want to override default prompt for this task specify this via a string. - pad_to_max_length: If true, pad to the maximum length. 
- """ - super().__init__(file_name, task_name, tokenizer, max_seq_length, use_cache, compute_features=False) - self.max_seq_length = max_seq_length - self.max_seq_length_decoder = max_seq_length_decoder - self.pad_to_max_length = pad_to_max_length - self.processor = processors[self.task_name]() - self.prefix_override = prefix_override - self.features = self.convert_examples_to_features() - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx): - enc_query, dec_input, labels = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels} - - def collate_fn(self, batch): - enc_query = [item['text_enc'] for item in batch] - dec_input = [item['text_dec'] for item in batch] - labels = [item['labels'] for item in batch] - - max_enc_query_length = max([len(item) for item in enc_query]) if enc_query else 0 - max_dec_input_length = max([len(item) for item in dec_input]) if dec_input else 0 - max_label_length = max([len(item) for item in labels]) if labels else 0 - if self.pad_to_max_length: - assert max_enc_query_length <= self.max_seq_length - assert max_dec_input_length <= self.max_seq_length_decoder - assert max_label_length <= self.max_seq_length_decoder - max_enc_query_length = self.max_seq_length - max_dec_input_length = self.max_seq_length_decoder - max_label_length = self.max_seq_length_decoder - - loss_mask = [([1] * (len(item))) + ([0] * (max_label_length - len(item))) for item in labels] - enc_query = [item + [self.tokenizer.pad_id] * (max_enc_query_length - len(item)) for item in enc_query] - dec_input = [item + [self.tokenizer.pad_id] * (max_dec_input_length - len(item)) for item in dec_input] - labels = [item + [self.tokenizer.pad_id] * (max_label_length - len(item)) for item in labels] - - enc_query = torch.LongTensor(enc_query) - dec_input = torch.LongTensor(dec_input) - labels = torch.LongTensor(labels) - loss_mask = torch.LongTensor(loss_mask) - - enc_mask = (enc_query != self.tokenizer.pad_id).long() - dec_mask = (dec_input != self.tokenizer.pad_id).long() - - return { - 'text_enc': enc_query, - 'text_dec': dec_input, - 'labels': labels, - 'loss_mask': loss_mask, - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - } - - def make_history_mask_3d(self, block): - batch, length = block.shape - arange = np.arange(length) - history_mask = (arange[None,] <= arange[:, None])[ - None, - ] - history_mask = np.repeat(history_mask, batch, 0) - return history_mask - - def convert_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. 
- """ - features = [] - for ex_index, example in enumerate(self.examples): - if ex_index % 10000 == 0: - logging.info(f"Writing example {ex_index} of {len(self.examples)}") - - text_to_text_query = self.processor.get_t5_prompted_query(example.text_a, example.text_b) - enc_query = self.tokenizer.text_to_ids(text_to_text_query) - if len(enc_query) > self.max_seq_length: - enc_query = enc_query[: self.max_seq_length] - dec_query = ( - [self.tokenizer.bos_id] - + self.tokenizer.text_to_ids(self.processor.label2string(example.label)) - + [self.tokenizer.eos_id] - ) - - dec_input = dec_query[:-1] - labels = dec_query[1:] - - features.append([enc_query, dec_input, labels]) - - return features - - -class TextToTextXNLIDataset(TextToTextGLUEDataset): - """XNLI Dataset in a text-to-text format.""" - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - lang_list: List[str] = None, - pad_to_max_length: bool = True, - ): - self.lang_list = set(lang_list) - super().__init__( - file_name, - task_name, - tokenizer, - max_seq_length, - max_seq_length_decoder, - use_cache, - prefix_override, - pad_to_max_length, - ) - if len(lang_list) <= 0 or lang_list is None: - raise ValueError(f"Found an empty or None lang_list for {self.task_name}") - self.features = self.convert_xnli_examples_to_features() - - def __getitem__(self, idx): - enc_query, dec_input, labels, lang = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels, 'lang': lang} - - def collate_fn(self, batch): - base_batch = super().collate_fn(batch) - base_batch['lang'] = [item['lang'] for item in batch] - return base_batch - - def convert_xnli_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. - """ - features = self.features - lang_filtered_features = [] - for ex_index, example in enumerate(self.examples): - language = example.guid.split('-')[1] - if language in self.lang_list: - lang_filtered_features.append(features[ex_index] + [language]) - return lang_filtered_features - - def __len__(self): - return len(self.features) - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/information_retrieval/__init__.py b/nemo/collections/nlp/data/information_retrieval/__init__.py deleted file mode 100644 index a32196ee7c11..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( - BertInformationRetrievalDataset, -) diff --git a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py deleted file mode 100644 index 0da7af6ed96d..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from random import choices, sample -from typing import Literal, Mapping, Optional - -import datasets -import numpy as np -import torch -from torch.utils.data import Dataset - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['BertEmbeddingDataset'] - - -class BertEmbeddingDataset(Dataset): - """ - Embedding Dataset Class. - """ - - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = True, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - num_hard_negatives: int = 4, - negative_sample_strategy: Literal["random", "first"] = 'first', - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec - (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. - Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. - Data examples will be dropped if they do not meet the min length requirements. 
- add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length - if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. - If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. - Default: {'system_turn_start': '', 'turn_start': '', - 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - negative_sample_strategy: Strategy for negative samples. Options: ['random', 'first'] - """ - # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.max_num_samples = max_num_samples - self.seed = seed - self.index_mapping_dir = index_mapping_dir - self.virtual_tokens = virtual_tokens - self.truncation_method = truncation_method - self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id else self.tokenizer.eos_id - self.negative_sample_strategy = negative_sample_strategy - assert ( - truncation_method == 'left' or truncation_method == 'right' - ), 'truncation_method must be either "left" or "right"' - assert ( - negative_sample_strategy == 'random' or negative_sample_strategy == 'first' - ), 'negative_sample_strategy must be either "random" or "first"' - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - self.data_type = data_type - self.num_hard_negatives = num_hard_negatives - - self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=index_mapping_dir, - workers=memmap_workers, - ) - # Will be None after this call if `max_num_samples` is None - self.samples_mapping = None - self._build_samples_mapping() - logging.info( - f"Creating EmbeddingDataset with seed={self.seed},\n" - f"add_bos={self.add_bos}, add_eos={self.add_eos},\n" - f"max_seq_length={self.max_seq_length}, min_seq_length={self.min_seq_length},\n" - f"pad_token_id={self.pad_token_id}, negative_sample_strategy={self.negative_sample_strategy},\n" - f"num_hard_negatives={self.num_hard_negatives}." 
- ) - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split('/')[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - ) - else: - self.samples_mapping = None - - def __len__(self): - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - assert self.samples_mapping is not None - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - if idx is not None: - assert idx < len(self.indexed_dataset) - else: - idx = -1 - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example['__AUTOGENERATED__'] = True - except Exception as e: - logging.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. - """ - - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - # handle cases where the required number of hard negatives are not present - if len(example['neg_doc']) < self.num_hard_negatives: - nd = example['neg_doc'] - # sample rest with replacement - nd = nd + choices(example['neg_doc'], k=self.num_hard_negatives - len(example['neg_doc'])) - else: - if self.negative_sample_strategy == 'random': - # sample without replacement - # Choose the first self.num_hard_negatives - nd = sample(example['neg_doc'], k=self.num_hard_negatives) - else: - # Choose the first self.num_hard_negatives samples - nd = example['neg_doc'][: self.num_hard_negatives] - assert len(nd) == self.num_hard_negatives, "Error in sampling required number of hard negatives" - nd = [self.tokenizer.text_to_ids("passage: " + ex.strip()) for ex in nd] - - elif self.data_type == 'query': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d, nd = None, None - assert "query_id" in example, "query_id is required for query dataset" - assert "doc_id" in example, "doc_id is required for query dataset" - elif self.data_type == 'doc': - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - assert "doc_id" in example, "doc_id is required for doc dataset" - q, nd = None, None - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - q = q if q is not None else [] - d = d if d is not None else [] - nd = nd if nd is not None else [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - q = [self.tokenizer.eos_id] * self.virtual_tokens 
+ q # type: ignore - d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore - nd = [[self.tokenizer.eos_id] * self.virtual_tokens + n for n in nd] # type: ignore - - if self.add_bos: - q = [self.tokenizer.bos_id] + q # type: ignore - d = [self.tokenizer.bos_id] + d # type: ignore - nd = [[self.tokenizer.bos_id] + n for n in nd] # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - q = q[: self.max_seq_length - 1] - d = d[: self.max_seq_length - 1] - nd = [n[: self.max_seq_length - 1] for n in nd] - - if self.add_eos: - q = q + [self.tokenizer.eos_id] # type: ignore - d = d + [self.tokenizer.eos_id] # type: ignore - nd = [n + [self.tokenizer.eos_id] for n in nd] # type: ignore - - processed_example = { - 'query': q, - 'pos_doc': d, - 'neg_doc': nd, - 'metadata': metadata, - } - return processed_example - - def _maybe_cast_to_list(self, x): - if isinstance(x, np.ndarray): - return [item.tolist() for item in x] - return x - - def _ceil_to_nearest(self, n, m): - return (n + m - 1) // m * m - - def _collate_item(self, item, max_length): - item = self._maybe_cast_to_list(item) - pad_id = self.pad_token_id - if self.truncation_method == 'left': - item = [[pad_id] * (max_length - len(x)) + x for x in item] - else: - item = [x + [pad_id] * (max_length - len(x)) for x in item] - return item - - @torch.no_grad() - def _create_attention_mask2(self, max_length, item_length): - """Create `attention_mask`. - Args: - input_ids: A 1D tensor that holds the indices of tokens. - """ - # seq_length = len(input_ids) - # `attention_mask` has the shape of [1, seq_length, seq_length] - attention_mask = torch.zeros(max_length) - if self.truncation_method == 'left': - # input ids: [pad] [pad] token token | - # attention mask: 0 0 1 1 - attention_mask[max_length - item_length :] = 1 - else: - # input ids: token token [pad] [pad] | - # attention mask: 1 1 0 0 - attention_mask[:item_length] = 1 - return attention_mask - - def _collate_fn(self, batch): - """ - Collate query passage together - """ - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - for nd in item['neg_doc']: - input_ids.append(nd) - lengths.append(len(nd)) - max_length = max( - max_length, len(item['query']), len(item['pos_doc']), *(len(nd) for nd in item['neg_doc']) - ) - elif self.data_type == 'query': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - max_length = max(max_length, len(item['query'])) - elif self.data_type == 'doc': - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - max_length = max(max_length, len(item['pos_doc'])) - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask2(max_length, len) for len in lengths] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in batch] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor(self._collate_item(input_ids, max_length=max_length)) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'input_ids': input_ids, - 'token_type_ids': 
torch.zeros_like(input_ids), - 'attention_mask': attention_mask, - 'metadata': metadata, - 'position_ids': position_ids, - } - - return processed_batch diff --git a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py deleted file mode 100644 index 3a2a8152313e..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py +++ /dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Mapping, Optional - -import datasets -import numpy as np -import torch - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['GPTEmbeddingDataset', 'GPTRerankerDataset'] - - -class GPTEmbeddingDataset(Dataset): - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - """ - # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.max_num_samples = max_num_samples - self.seed = seed - self.index_mapping_dir = index_mapping_dir - self.virtual_tokens = virtual_tokens - self.truncation_method = truncation_method - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - self.data_type = data_type - - self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=index_mapping_dir, - workers=memmap_workers, - ) - - # Will be None after this call if `max_num_samples` is None - self.samples_mapping = None - self._build_samples_mapping() - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split('/')[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - ) - else: - self.samples_mapping = None - - def __len__(self): - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - assert self.samples_mapping is not None - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - assert idx < len(self.indexed_dataset) - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example['__AUTOGENERATED__'] = True - except Exception as e: - logging.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. 
- """ - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - nd = self.tokenizer.text_to_ids("passage: " + example['neg_doc'].strip()) - elif self.data_type == 'query': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d, nd = None, None - assert "query_id" in example, "query_id is required for query dataset" - assert "doc_id" in example, "doc_id is required for query dataset" - elif self.data_type == 'doc': - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - assert "doc_id" in example, "doc_id is required for doc dataset" - q, nd = None, None - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - q = q if q is not None else [] - d = d if d is not None else [] - nd = nd if nd is not None else [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - q = [self.tokenizer.eos_id] * self.virtual_tokens + q # type: ignore - d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore - nd = [self.tokenizer.eos_id] * self.virtual_tokens + nd # type: ignore - - if self.add_bos: - q = [self.tokenizer.bos_id] + q # type: ignore - d = [self.tokenizer.bos_id] + d # type: ignore - nd = [self.tokenizer.bos_id] + nd # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - q = q[: self.max_seq_length - 1] - d = d[: self.max_seq_length - 1] - nd = nd[: self.max_seq_length - 1] - - if self.add_eos: - q = q + [self.tokenizer.eos_id] # type: ignore - d = d + [self.tokenizer.eos_id] # type: ignore - nd = nd + [self.tokenizer.eos_id] # type: ignore - - processed_example = { - 'query': q, - 'pos_doc': d, - 'neg_doc': nd, - 'metadata': metadata, - } - - return processed_example - - def _maybe_cast_to_list(self, x): - if isinstance(x, np.ndarray): - return [item.tolist() for item in x] - return x - - def _ceil_to_nearest(self, n, m): - return (n + m - 1) // m * m - - def _collate_item(self, item, max_length, pad_id): - item = self._maybe_cast_to_list(item) - # max_length = max([len(x) for x in item]) if item else 0 - # here [0] should be tokenizer.pad_id - item = [x + [pad_id] * (max_length - len(x)) for x in item] - return item - - @torch.no_grad() - def _create_attention_mask(self, max_length): - """Create `attention_mask`. - Args: - input_ids: A 1D tensor that holds the indices of tokens. 
- """ - # seq_length = len(input_ids) - # `attention_mask` has the shape of [1, seq_length, seq_length] - attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0) - attention_mask = attention_mask < 0.5 - return attention_mask - - def collate_fn(self, batch): - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - input_ids.append(item['neg_doc']) - lengths.append(len(item['neg_doc'])) - max_length = max(max_length, len(item['query']), len(item['pos_doc']), len(item['neg_doc'])) - elif self.data_type == 'query': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - max_length = max(max_length, len(item['query'])) - elif self.data_type == 'doc': - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - max_length = max(max_length, len(item['pos_doc'])) - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask(max_length) for _ in input_ids] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in input_ids] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor( - self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) - ) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'tokens': input_ids, - 'attention_mask': attention_mask, - 'loss_mask': lengths, - 'position_ids': position_ids, - 'metadata': metadata, - } - - return processed_batch - - -class GPTRerankerDataset(GPTEmbeddingDataset): - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. 
Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - """ - super().__init__( - file_path=file_path, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - min_seq_length=min_seq_length, - add_bos=add_bos, - add_eos=add_eos, - max_num_samples=max_num_samples, - seed=seed, - index_mapping_dir=index_mapping_dir, - virtual_tokens=virtual_tokens, - memmap_workers=memmap_workers, - truncation_method=truncation_method, - special_tokens=special_tokens, - data_type=data_type, - ) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. - """ - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - qd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() - ) - qnd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['neg_doc'].strip() - ) - else: - qd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() - ) - qnd = [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - qd = [self.tokenizer.eos_id] * self.virtual_tokens + qd # type: ignore - qnd = [self.tokenizer.eos_id] * self.virtual_tokens + qnd # type: ignore - - if self.add_bos: - qd = [self.tokenizer.bos_id] + qd # type: ignore - qnd = [self.tokenizer.bos_id] + qnd # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - qd = qd[: self.max_seq_length - 1] - qnd = qnd[: self.max_seq_length - 1] - - if self.add_eos: - qd = qd + [self.tokenizer.eos_id] # type: ignore - qnd = qnd + [self.tokenizer.eos_id] # type: ignore - - processed_example = { - 'query_pos_doc': qd, - 'query_neg_doc': qnd, - 'metadata': metadata, - } - - return processed_example - - def collate_fn(self, batch): - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query_pos_doc']) - lengths.append(len(item['query_pos_doc'])) - input_ids.append(item['query_neg_doc']) - lengths.append(len(item['query_neg_doc'])) - max_length = max(max_length, len(item['query_pos_doc']), len(item['query_neg_doc'])) - else: - input_ids.append(item['query_pos_doc']) - lengths.append(len(item['query_pos_doc'])) - max_length = max(max_length, len(item['query_pos_doc'])) - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask(max_length) for _ in input_ids] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in input_ids] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor( - self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) - ) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'tokens': input_ids, - 'attention_mask': attention_mask, - 'loss_mask': lengths, - 'position_ids': 
position_ids, - 'metadata': metadata, - } - - return processed_batch diff --git a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py b/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py deleted file mode 100644 index 349f9e43ef97..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing as mp -import os -import pickle -import random -from typing import Optional - -import numpy as np -from torch.utils.data import Dataset - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ["BertInformationRetrievalDataset"] - - -class BaseInformationRetrievalDataset(Dataset): - """ - Base information retrieval dataset on which other datasets are built. - - Args: - tokenizer: tokenizer - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - """ - - def __init__( - self, tokenizer: TokenizerSpec, max_query_length: Optional[int] = 31, max_passage_length: Optional[int] = 190, - ): - self.tokenizer = tokenizer - self.max_query_length = max_query_length - self.max_passage_length = max_passage_length - - def parse_npz(self, file, max_seq_length): - """ - Function which parses passages (documents) in npz format. - After pre-processing and tokenization, the dataset will be saved - as numpy matrix, i_th entry of which corresponds to i_th passage (document) - and has the following form: - [n, token_1, ..., token_n, 0, ..., 0] - where n is the passage length (in tokens) and 0s correspond to pad tokens. - - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".npz" - if os.path.isfile(cached_collection): - dataset_npz = np.load(cached_collection)["data"] - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - dataset_npz = np.zeros((len(dataset_dict), max_seq_length + 1)) - for key in dataset_dict: - dataset_npz[key][0] = len(dataset_dict[key]) - dataset_npz[key][1 : len(dataset_dict[key]) + 1] = dataset_dict[key] - np.savez(cached_collection, data=dataset_npz) - return dataset_npz - - def parse_pkl(self, file, max_seq_length): - """ - Function which parses passages (documents, queries) in pkl format. - After pre-processing and tokenization, the dataset will be saved - as pkl dict, i_th entry of which corresponds to i_th passage (document, query) - and has the following form: - {passage_id: [token_1, ..., token_n]} - where n is the passage length (in tokens). 
- - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".pkl" - if os.path.isfile(cached_collection): - dataset_dict = pickle.load(open(cached_collection, "rb")) - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - pickle.dump(dataset_dict, open(cached_collection, "wb")) - return dataset_dict - - def tokenize_dataset(self, file, max_seq_length): - """ - Function which pre-tokenizes the dataset. - """ - lines = open(file, "r").readlines() - with mp.Pool() as pool: - dataset_dict = pool.map(self.preprocess_line, lines) - dataset_dict = {id_: tokens[:max_seq_length] for (id_, tokens) in dataset_dict} - return dataset_dict - - def preprocess_line(self, line): - """ - Parse a single entry (line) of tsv file. - """ - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - id_, text = line.split("\t") - token_ids = self.tokenizer.text_to_ids(text.strip()) - return int(id_), token_ids - - def construct_input(self, token_ids1, max_seq_length, token_ids2=None): - """ - Function which constructs a valid input to BERT from tokens. - - If only one list of tokens (token_ids1) is passed, the input will be - [CLS] token_ids1 [SEP] - - if two lists of tokens are passed, the input will be - [CLS] token_ids1 [SEP] token_ids2 [SEP] - """ - - input_ids = [self.tokenizer.pad_id] * max_seq_length - bert_input = [self.tokenizer.cls_id] + token_ids1 + [self.tokenizer.sep_id] - sentence1_length = len(bert_input) - if token_ids2 is not None: - bert_input = bert_input + token_ids2 + [self.tokenizer.sep_id] - - bert_input = bert_input[:max_seq_length] - - num_nonpad_tokens = len(bert_input) - - input_ids[:num_nonpad_tokens] = bert_input - input_ids = np.array(input_ids, dtype=np.longlong) - input_mask = input_ids != self.tokenizer.pad_id - input_type_ids = np.ones_like(input_ids) - input_type_ids[:sentence1_length] = 0 - - return input_ids, input_mask, input_type_ids - - def preprocess_bert(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into a tensor of size [k, max_length] with the following rows: - [CLS] Q_text [SEP] Pi_text [SEP], i = 1, ..., k - """ - - max_seq_length = self.max_query_length + self.max_passage_length + 3 - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self.queries[query_id], max_seq_length, self._psgid2tokens(psg_id)) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = np.stack(input_type_ids) - - return input_ids, input_mask, input_type_ids - - def preprocess_dpr(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into two tensors of sizes [1, max_q_length] and [k, max_p_length] - with the following rows: - 1) [CLS] Q_text [SEP] - 2) [CLS] Pi_text [SEP], i = 1, ..., k - """ - - q_input_ids, q_input_mask, q_type_ids = self.construct_input(self.queries[query_id], self.max_query_length + 2) - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self._psgid2tokens(psg_id), self.max_passage_length + 2) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = 
np.stack(input_type_ids) - return ( - q_input_ids[None, ...], - q_input_mask[None, ...], - q_type_ids[None, ...], - input_ids, - input_mask, - input_type_ids, - ) - - def _psgid2tokens(self, psg_id): - """ - Internal function which maps passage id to its tokens. - """ - pass - - def psgid2tokens_npz(self, psg_id): - """ - Mapping from passage id to its tokens in case of npz cache format. - """ - seq_len = self.passages[psg_id][0] - return self.passages[psg_id][1 : seq_len + 1].tolist() - - def psgid2tokens_pkl(self, psg_id): - """ - Mapping from passage id to its tokens in case of pkl cache format. - """ - return self.passages[psg_id] - - -class BertInformationRetrievalDataset(BaseInformationRetrievalDataset): - def __init__( - self, - tokenizer: TokenizerSpec, - passages: str, - queries: str, - query_to_passages: str, - max_query_length: Optional[int] = 31, - max_passage_length: Optional[int] = 190, - num_negatives: Optional[int] = 10, - preprocess_fn: Optional[str] = "preprocess_bert", - psg_cache_format: Optional[str] = "npz", - ): - """ - Dataset for training information retrieval models. - - Args: - tokenizer: tokenizer - passages: path to tsv with [psg_id, psg_text] entries - queries: path to tsv with [query_id, query_text] entries - query_to_passages: path to tsv with - [query_id, pos_psg_id, neg_psg_id_1, ..., neg_psg_id_k] entries - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - num_negatives: number of negative passages per positive to use for training - preprocess_fn: either preprocess_bert or preprocess_dpr - preprocess_bert: joint input: [CLS] query [SEP] passage [SEP] - preprocess_dpr: separate inputs: [CLS] query [SEP], [CLS] passage [SEP] - psg_cache_format: either pkl or npz - """ - - super().__init__(tokenizer, max_query_length, max_passage_length) - self.num_negatives = num_negatives - - self.passages = getattr(self, f"parse_{psg_cache_format}")(passages, max_passage_length) - self._psgid2tokens = getattr(self, f"psgid2tokens_{psg_cache_format}") - self.queries = self.parse_pkl(queries, max_query_length) - self.idx2psgs = self.parse_query_to_passages(query_to_passages) - self._preprocess_fn = getattr(self, preprocess_fn) - - def __getitem__(self, idx): - query_and_psgs = self.idx2psgs[idx] - query_id, psg_ids = query_and_psgs[0], query_and_psgs[1:] - inputs = self._preprocess_fn(query_id, psg_ids) - return [*inputs, query_id, np.array(psg_ids)] - - def __len__(self): - return len(self.idx2psgs) - - def parse_query_to_passages(self, file): - """ - Function which parses query to passages correspondence file. 
- """ - idx2psgs = {} - idx = 0 - for line in open(file, "r").readlines(): - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - query_and_psgs = line.split("\t") - query_and_psgs_ids = [int(id_) for id_ in query_and_psgs] - query_and_rel_psg_ids, irrel_psgs_ids = query_and_psgs_ids[:2], query_and_psgs_ids[2:] - random.shuffle(irrel_psgs_ids) - num_samples = len(irrel_psgs_ids) // self.num_negatives - for j in range(num_samples): - left = self.num_negatives * j - right = self.num_negatives * (j + 1) - idx2psgs[idx] = query_and_rel_psg_ids + irrel_psgs_ids[left:right] - idx += 1 - return idx2psgs diff --git a/nemo/collections/nlp/data/intent_slot_classification/__init__.py b/nemo/collections/nlp/data/intent_slot_classification/__init__.py deleted file mode 100644 index 3e1782e02e4f..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import ( - IntentSlotClassificationDataset, - IntentSlotInferenceDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_descriptor import ( - IntentSlotDataDesc, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_dataset import ( - MultiLabelIntentSlotClassificationDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_descriptor import ( - MultiLabelIntentSlotDataDesc, -) diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py deleted file mode 100644 index a73341aa719d..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['IntentSlotClassificationDataset', 'IntentSlotInferenceDataset'] - - -def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - raw_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, -): - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = False - if raw_slots is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([raw_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - - # May be useful for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - 
logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class IntentSlotClassificationDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - For dataset to use during inference without labels, see - IntentSlotDataset. - - Args: - input_file: file to sequence + label. the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - slot_file: file to slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - query = ' '.join(parts[:-1]) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, 
idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - -class IntentSlotInferenceDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - if do_lower_case: - for idx, query in enumerate(queries): - queries[idx] = queries[idx].lower() - - features = get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py deleted file mode 100644 index 544b5e1db858..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_label_stats, - if_exist, -) -from nemo.utils import logging - - -class IntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - IntentSlotDataDesc. - - By default, the None label for slots is 'O'. - - IntentSlotDataDesc requires two files: - - input_file: file to sequence + label. 
- the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ['train', 'test', 'dev'], - none_slot_label: str = 'O', - pad_label: int = -1, - ): - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - - self.intents_label_ids = IntentSlotDataDesc.label2idx(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = IntentSlotDataDesc.label2idx(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - logging.info(f' Stats calculating for {mode} mode...') - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - - logging.info(f'Three most popular intents in {mode} mode:') - total_intents, intent_label_freq, max_id = get_label_stats( - raw_intents, infold + f'/{mode}_intent_stats.tsv' - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f'Three most popular slots in {mode} mode:') - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - logging.info(f'Total Number of Intents: {total_intents}') - logging.info(f'Intent Label Frequencies: {intent_label_freq}') - logging.info(f'Total Number of Slots: {slots_total}') - logging.info(f'Slots Label Frequencies: {slots_label_freq}') - - if mode == 'train': - intent_weights_dict = get_freq_weights(intent_label_freq) - logging.info(f'Intent Weights: {intent_weights_dict}') - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f'Slot Weights: {slot_weights_dict}') - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = self.slots_label_ids[none_slot_label] - - @staticmethod - def label2idx(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - @staticmethod - def intent_slot_dicts(data_dir): - ''' - Return Intent and slot dictionaries - ''' - intent_dict_file = data_dir + '/dict.intents.csv' - slot_dict_file = data_dir + '/dict.slots.csv' - - intents_labels = open(intent_dict_file, 'r').readlines() - intents_labels = [line.strip() for line in intents_labels if line.strip()] - - slots_labels = open(slot_dict_file, 'r').readlines() - slots_labels = [line.strip() for line in slots_labels if line.strip()] - - return intents_labels, slots_labels diff --git a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py deleted file mode 100644 index 32a72d107193..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
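As a quick illustration of the label-to-id convention that IntentSlotDataDesc.label2idx above relies on: the i-th non-empty line of dict.intents.csv or dict.slots.csv maps to label id i, and the id of none_slot_label becomes the default pad_label. The slot names below are invented; this is just an equivalent snippet, not part of the deleted module.

```python
# Hypothetical dict.slots.csv contents, one label per line; line index == label id.
slot_lines = ["O\n", "B-city\n", "I-city\n", "B-date\n"]

# Equivalent of IntentSlotDataDesc.label2idx: drop blank lines, strip, map label -> index.
stripped = [line.strip() for line in slot_lines if line.strip()]
slots_label_ids = {stripped[i]: i for i in range(len(stripped))}

print(slots_label_ids)            # {'O': 0, 'B-city': 1, 'I-city': 2, 'B-date': 3}
pad_label = slots_label_ids["O"]  # with none_slot_label='O', pad_label defaults to 0
```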
- -from typing import Dict, Optional - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.intent_slot_classification import IntentSlotClassificationDataset -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import get_features -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType - -__all__ = ['MultiLabelIntentSlotClassificationDataset'] - - -class MultiLabelIntentSlotClassificationDataset(IntentSlotClassificationDataset): - """ - Creates dataset to use for the task of multi-label joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - Args: - input_file: file containing sentences + labels. The first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label can be multiple labels separated by a comma - slot_file: file containing slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - num_intents: total number of intents in dict.intents file - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': [NeuralType(('B'), LabelsType())], - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - num_intents: int, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(num_intents)] - raw_intents.append(tuple(parts)) - tokens = input_line.strip().split("\t")[0].split() - query = ' '.join(tokens) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - - 
self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents diff --git a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py b/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py deleted file mode 100644 index ddde1a2896de..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_freq_weights_bce_with_logits_loss, - get_label_stats, - get_labels_to_labels_id_mapping, - get_multi_label_stats, - if_exist, -) -from nemo.utils import logging - - -class MultiLabelIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - MultiLabelIntentSlotDataDesc. - - By default, the None label for slots is 'O'. - - MultiLabelIntentSlotDataDesc requires two files: - - input_file: file containing sentences + labels. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label is a string of comma separated values. - Example: 1 or 1,2 are both valid labels - - slot_file: file containing slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ["train", "test", "dev"], - none_slot_label: str = "O", - pad_label: int = -1, - ): - if not if_exist(data_dir, ["dict.intents.csv", "dict.slots.csv"]): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by MultiLabelIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." 
- ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + "/dict.intents.csv" - self.slot_dict_file = self.data_dir + "/dict.slots.csv" - - self.intents_label_ids = get_labels_to_labels_id_mapping(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = get_labels_to_labels_id_mapping(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f"{mode}.tsv"]): - logging.info(f" Stats calculation for {mode} mode" f" is skipped as {mode}.tsv was not found.") - continue - logging.info(f" Stats calculating for {mode} mode...") - slot_file = f"{self.data_dir}/{mode}_slots.tsv" - with open(slot_file, "r") as f: - slot_lines = f.readlines() - - input_file = f"{self.data_dir}/{mode}.tsv" - with open(input_file, "r") as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." - ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(self.num_intents)] - raw_intents.append(tuple(parts)) - - logging.info(f"Three most popular intents in {mode} mode:") - total_intents, intent_label_freq, max_id = get_multi_label_stats( - raw_intents, infold + f"/{mode}_intent_stats.tsv" - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f"Three most popular slots in {mode} mode:") - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f"/{mode}_slot_stats.tsv") - - logging.info(f"Total Number of Intent Labels: {total_intents}") - logging.info(f"Intent Label Frequencies: {intent_label_freq}") - logging.info(f"Total Number of Slots: {slots_total}") - logging.info(f"Slots Label Frequencies: {slots_label_freq}") - - if mode == "train": - intent_weights_dict = get_freq_weights_bce_with_logits_loss(intent_label_freq) - logging.info(f"Intent Weights: {intent_weights_dict}") - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f"Slot Weights: {slot_weights_dict}") - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f"none_slot_label {none_slot_label} not " f"found in {self.slot_dict_file}.") - self.pad_label = self.slots_label_ids[none_slot_label] diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py b/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py deleted file mode 100644 index 4e786276108c..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.spellchecking_asr_customization.dataset import ( - SpellcheckingAsrCustomizationDataset, - SpellcheckingAsrCustomizationTestDataset, - TarredSpellcheckingAsrCustomizationDataset, -) diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py deleted file mode 100644 index c98abb300c64..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright 2019 The Google Research Authors. -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from collections import OrderedDict -from os import path -from typing import Dict, List, Optional, Tuple, Union - -from transformers import PreTrainedTokenizerBase - -from nemo.utils.decorators import deprecated_warning - -"""Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. -""" - - -class BertExample(object): - """Class for training and inference examples for BERT. - - Attributes: - features: Feature dictionary. - """ - - def __init__( - self, - input_ids: List[int], - input_mask: List[int], - segment_ids: List[int], - input_ids_for_subwords: List[int], - input_mask_for_subwords: List[int], - segment_ids_for_subwords: List[int], - character_pos_to_subword_pos: List[int], - fragment_indices: List[Tuple[int, int, int]], - labels_mask: List[int], - labels: List[int], - spans: List[Tuple[int, int, int]], - default_label: int, - ) -> None: - """Inputs to the example wrapper - - Args: - input_ids: indices of single characters (treated as subwords) - input_mask: list of bools with 0s in place of input_ids to be masked - segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, - 1 - for tokens of the first candidate - ... 
- 10 - for tokens of the tenth candidate - ) - input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) - input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked - segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) - fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set - labels_mask: bool tensor with 0s in place of label tokens to be masked - labels: indices of semiotic classes which should be predicted from each of the - corresponding input tokens - spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) - default_label: The default label - """ - # deprecation warning - deprecated_warning("BertExample") - - input_len = len(input_ids) - if not ( - input_len == len(input_mask) - and input_len == len(segment_ids) - and input_len == len(labels_mask) - and input_len == len(labels) - and input_len == len(character_pos_to_subword_pos) - ): - raise ValueError("All feature lists should have the same length ({})".format(input_len)) - - input_len_for_subwords = len(input_ids_for_subwords) - if not ( - input_len_for_subwords == len(input_mask_for_subwords) - and input_len_for_subwords == len(segment_ids_for_subwords) - ): - raise ValueError( - "All feature lists for subwords should have the same length ({})".format(input_len_for_subwords) - ) - - self.features = OrderedDict( - [ - ("input_ids", input_ids), - ("input_mask", input_mask), - ("segment_ids", segment_ids), - ("input_ids_for_subwords", input_ids_for_subwords), - ("input_mask_for_subwords", input_mask_for_subwords), - ("segment_ids_for_subwords", segment_ids_for_subwords), - ("character_pos_to_subword_pos", character_pos_to_subword_pos), - ("fragment_indices", fragment_indices), - ("labels_mask", labels_mask), - ("labels", labels), - ("spans", spans), - ] - ) - self._default_label = default_label - - -class BertExampleBuilder(object): - """Builder class for BertExample objects.""" - - def __init__( - self, - label_map: Dict[str, int], - semiotic_classes: Dict[str, int], - tokenizer: PreTrainedTokenizerBase, - max_seq_length: int, - ) -> None: - """Initializes an instance of BertExampleBuilder. - - Args: - label_map: Mapping from tags to tag IDs. - semiotic_classes: Mapping from semiotic classes to their ids. - tokenizer: Tokenizer object. - max_seq_length: Maximum sequence length. - """ - # deprecation warning - deprecated_warning("BertExampleBuilder") - - self._label_map = label_map - self._semiotic_classes = semiotic_classes - self._tokenizer = tokenizer - self._max_seq_length = max_seq_length - # one span usually covers one or more words and it only exists for custom phrases, so there are much less spans than characters. - self._max_spans_length = max(4, int(max_seq_length / 20)) - self._pad_id = self._tokenizer.pad_token_id - self._default_label = 0 - - def build_bert_example( - self, hyp: str, ref: str, target: Optional[str] = None, span_info: Optional[str] = None, infer: bool = False - ) -> Optional[BertExample]: - """Constructs a BERT Example. - - Args: - hyp: Hypothesis text. - ref: Candidate customization variants divided by ';' - target: - if infer==False, string of labels (each label is 1-based index of correct candidate) or 0. 
- if infer==True, it can be None or string of labels (each label is 1-based index of some candidate). In inference this can be used to get corresponding fragments to fragment_indices. - span_info: - string of format "CUSTOM 6 20;CUSTOM 40 51", number of parts corresponds to number of targets. Can be empty if target is 0. - If infer==False, numbers are correct start and end(exclusive) positions of the corresponding target candidate in the text. - If infer==True, numbers are EXPECTED positions in the text. In inference this can be used to get corresponding fragments to fragment_indices. - infer: inference mode - Returns: - BertExample, or None if the conversion from text to tags was infeasible - - Example (infer=False): - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - target: "1 3" - span_info: "CUSTOM 12 23;CUSTOM 28 41" - """ - if not ref.count(";") == 9: - raise ValueError("Expect 10 candidates: " + ref) - - span_info_parts = [] - targets = [] - - if len(target) > 0 and target != "0": - span_info_parts = span_info.split(";") - targets = list(map(int, target.split(" "))) - if len(span_info_parts) != len(targets): - raise ValueError( - "len(span_info_parts)=" - + str(len(span_info_parts)) - + " is different from len(target_parts)=" - + str(len(targets)) - ) - - tags = [0 for _ in hyp.split()] - if not infer: - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - ( - input_ids, - input_mask, - segment_ids, - labels_mask, - labels, - _, - _, - ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = self._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - # used in forward to concatenate subword embeddings to character embeddings - character_pos_to_subword_pos = self._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - fragment_indices = [] - if infer: - # used in inference to take argmax over whole fragments instead of separate characters to get more consistent predictions - fragment_indices = self._get_fragment_indices(hyp, targets, span_info_parts) - - spans = [] - if not infer: - # during training spans are used in validation step to calculate accuracy on whole custom phrases instead of separate characters - spans = self._get_spans(span_info_parts) - - if len(input_ids) > self._max_seq_length or len(spans) > self._max_spans_length: - print( - "Max len exceeded: len(input_ids)=", - len(input_ids), - "; _max_seq_length=", - self._max_seq_length, - "; len(spans)=", - len(spans), - "; _max_spans_length=", - self._max_spans_length, - ) - return None - - example = BertExample( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - input_ids_for_subwords=input_ids_for_subwords, - input_mask_for_subwords=input_mask_for_subwords, - segment_ids_for_subwords=segment_ids_for_subwords, - character_pos_to_subword_pos=character_pos_to_subword_pos, - fragment_indices=fragment_indices, - 
labels_mask=labels_mask, - labels=labels, - spans=spans, - default_label=self._default_label, - ) - return example - - def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] - """ - result_spans = [] - - for p in span_info_parts: - if p == "": - break - c, start, end = p.split(" ") - if c not in self._semiotic_classes: - raise KeyError("class=" + c + " not found in self._semiotic_classes") - cid = self._semiotic_classes[c] - # +1 because this should be indexing on input_ids which has [CLS] token at beginning - start = int(start) + 1 - end = int(end) + 1 - result_spans.append((cid, start, end)) - return result_spans - - def _get_fragment_indices( - self, hyp: str, targets: List[int], span_info_parts: List[str] - ) -> Tuple[List[Tuple[int, int, int]]]: - """Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). - In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). - - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ - - fragment_indices = [] - - letters = hyp.split() - - for target, p in zip(targets, span_info_parts): - _, start, end = p.split(" ") - start = int(start) - end = min(int(end), len(hyp)) # guarantee that end is not outside length - - # Adjusting strategy 1: expand both sides to the nearest space. - # Adjust start by finding the nearest left space or beginning of text. If start is already some word beginning, it won't change. - k = start - while k > 0 and letters[k] != '_': - k -= 1 - adjusted_start = k if k == 0 else k + 1 - - # Adjust end by finding the nearest right space. If end is already space or sentence end, it won't change. - k = end - while k < len(letters) and letters[k] != '_': - k += 1 - adjusted_end = k - - # +1 because this should be indexing on input_ids which has [CLS] token at beginning - fragment_indices.append((adjusted_start + 1, adjusted_end + 1, target)) - - # Adjusting strategy 2: try to shrink to the closest space (from left or right or both sides). 
- # For example, here the candidate "shippers" has a matching n-gram covering part of previous word - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - expanded_fragment = "".join(letters[adjusted_start:adjusted_end]) - left_space_position = expanded_fragment.find("_") - right_space_position = expanded_fragment.rfind("_") - is_left_shrink = False - is_right_shrink = False - if left_space_position > -1 and left_space_position < len(expanded_fragment) / 2: - # +1 because of CLS token, another +1 to put start position after found space - fragment_indices.append((adjusted_start + 1 + left_space_position + 1, adjusted_end + 1, target)) - is_left_shrink = True - if right_space_position > -1 and right_space_position > len(expanded_fragment) / 2: - fragment_indices.append((adjusted_start + 1, adjusted_start + 1 + right_space_position, target)) - is_right_shrink = True - if is_left_shrink and is_right_shrink: - fragment_indices.append( - (adjusted_start + 1 + left_space_position + 1, adjusted_start + 1 + right_space_position, target) - ) - - return fragment_indices - - def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... 
, 45, 46, 46, 46, 46, 46, 47] - """ - character_pos_to_subword_pos = [0 for _ in input_ids] - - ## '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' - tokens = self._tokenizer.convert_ids_to_tokens(input_ids) - ## '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] - tokens_for_subwords = self._tokenizer.convert_ids_to_tokens(input_ids_for_subwords) - j = 0 # index for tokens_for_subwords - j_offset = 0 # current letter index within subword - for i in range(len(tokens)): - character = tokens[i] - subword = tokens_for_subwords[j] - if character == "[CLS]" and subword == "[CLS]": - character_pos_to_subword_pos[i] = j - j += 1 - continue - if character == "[SEP]" and subword == "[SEP]": - character_pos_to_subword_pos[i] = j - j += 1 - continue - if character == "[CLS]" or character == "[SEP]" or subword == "[CLS]" or subword == "[SEP]": - raise IndexError( - "character[" - + str(i) - + "]=" - + character - + "; subword[" - + str(j) - + ";=" - + subword - + "subwords=" - + str(tokens_for_subwords) - ) - # At this point we expect that - # subword either 1) is a normal first token of a word or 2) starts with "##" (not first word token) - # character either 1) is a normal character or 2) is a space character "_" - if character == "_": - character_pos_to_subword_pos[i] = j - 1 # space is assigned to previous subtoken - continue - if j_offset < len(subword): - if character == subword[j_offset]: - character_pos_to_subword_pos[i] = j - j_offset += 1 - else: - raise IndexError( - "character mismatch:" - + "i=" - + str(i) - + "j=" - + str(j) - + "j_offset=" - + str(j_offset) - + "; len(tokens)=" - + str(len(tokens)) - + "; len(subwords)=" - + str(len(tokens_for_subwords)) - ) - # if subword is finished, increase j - if j_offset >= len(subword): - j += 1 - j_offset = 0 - if j >= len(tokens_for_subwords): - break - if tokens_for_subwords[j].startswith("##"): - j_offset = 2 - # check that all subword tokens are processed - if j < len(tokens_for_subwords): - raise IndexError( - "j=" - + str(j) - + "; len(tokens)=" - + str(len(tokens)) - + "; len(subwords)=" - + str(len(tokens_for_subwords)) - ) - return character_pos_to_subword_pos - - def _get_input_features( - self, hyp: str, ref: str, tags: List[int] - ) -> Tuple[List[int], List[int], List[int], List[int], List[int], List[str], List[int]]: - """Converts given ASR-hypothesis(hyp) and candidate string(ref) to features(token ids, mask, segment ids, etc). - - Args: - hyp: Hypothesis text. - ref: Candidate customization variants divided by ';' - tags: List of labels corresponding to each token of ASR-hypothesis or None when building an example during inference. - Returns: - Features (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) - - Note that this method is called both for character-based example and for word-based example (to split to subwords). 
- - Character-based example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - tags: "0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3" - - resulting token sequence: - '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' - - Word-based example: - hyp: "astronomers didie somon and tristian gllo" - ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" - tags: None (not used for word-based case) - - resulting token sequence: - '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] - """ - - labels_mask = [] - labels = [] - if tags is None: - hyp_tokens, token_start_indices = self._split_to_wordpieces(hyp.split()) - else: - hyp_tokens, labels, token_start_indices = self._split_to_wordpieces_with_labels(hyp.split(), tags) - references = ref.split(";") - all_ref_tokens = [] - all_ref_segment_ids = [] - for i in range(len(references)): - ref_tokens, _ = self._split_to_wordpieces(references[i].split()) - all_ref_tokens.extend(ref_tokens + ["[SEP]"]) - all_ref_segment_ids.extend([i + 1] * (len(ref_tokens) + 1)) - - input_tokens = ["[CLS]"] + hyp_tokens + ["[SEP]"] + all_ref_tokens # ends with [SEP] - input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens) - input_mask = [1] * len(input_ids) - segment_ids = [0] + [0] * len(hyp_tokens) + [0] + all_ref_segment_ids - if len(input_ids) != len(segment_ids): - raise ValueError( - "len(input_ids)=" - + str(len(input_ids)) - + " is different from len(segment_ids)=" - + str(len(segment_ids)) - ) - - if tags: - labels_mask = [0] + [1] * len(labels) + [0] + [0] * len(all_ref_tokens) - labels = [0] + labels + [0] + [0] * len(all_ref_tokens) - return (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) - - def _split_to_wordpieces_with_labels( - self, tokens: List[str], labels: List[int] - ) -> Tuple[List[str], List[int], List[int]]: - """Splits tokens (and the labels accordingly) to WordPieces. - - Args: - tokens: Tokens to be split. - labels: Labels (one per token) to be split. - - Returns: - 3-tuple with the split tokens, split labels, and the indices of starting tokens of words - """ - bert_tokens = [] # Original tokens split into wordpieces. - bert_labels = [] # Label for each wordpiece. - # Index of each wordpiece that starts a new token. - token_start_indices = [] - for i, token in enumerate(tokens): - # '+ 1' is because bert_tokens will be prepended by [CLS] token later. - token_start_indices.append(len(bert_tokens) + 1) - pieces = self._tokenizer.tokenize(token) - bert_tokens.extend(pieces) - bert_labels.extend([labels[i]] * len(pieces)) - return bert_tokens, bert_labels, token_start_indices - - def _split_to_wordpieces(self, tokens: List[str]) -> Tuple[List[str], List[int]]: - """Splits tokens to WordPieces. - - Args: - tokens: Tokens to be split. - - Returns: - tuple with the split tokens, and the indices of the WordPieces that start a token. 
- """ - bert_tokens = [] # Original tokens split into wordpieces. - # Index of each wordpiece that starts a new token. - token_start_indices = [] - for i, token in enumerate(tokens): - # '+ 1' is because bert_tokens will be prepended by [CLS] token later. - token_start_indices.append(len(bert_tokens) + 1) - pieces = self._tokenizer.tokenize(token) - bert_tokens.extend(pieces) - return bert_tokens, token_start_indices - - def read_input_file( - self, input_filename: str, infer: bool = False - ) -> Union[List['BertExample'], Tuple[List['BertExample'], Tuple[str, str]]]: - """Reads in Tab Separated Value file and converts to training/inference-ready examples. - - Args: - example_builder: Instance of BertExampleBuilder - input_filename: Path to the TSV input file. - infer: If true, input examples do not contain target info. - - Returns: - examples: List of converted examples (BertExample). - or - (examples, hyps_refs): If infer==true, returns h - """ - - if not path.exists(input_filename): - raise ValueError("Cannot find file: " + input_filename) - examples = [] # output list of BertExample - hyps_refs = [] # output list of tuples (ASR-hypothesis, candidate_str) - with open(input_filename, 'r') as f: - for line in f: - if len(examples) % 1000 == 0: - logging.info("{} examples processed.".format(len(examples))) - if infer: - parts = line.rstrip('\n').split('\t') - hyp, ref, target, span_info = parts[0], parts[1], None, None - if len(parts) == 4: - target, span_info = parts[2], parts[3] - try: - example = self.build_bert_example(hyp, ref, target=target, span_info=span_info, infer=infer) - except Exception as e: - logging.warning(str(e)) - logging.warning(line) - continue - if example is None: - logging.info("cannot create example: ") - logging.info(line) - continue - hyps_refs.append((hyp, ref)) - examples.append(example) - else: - hyp, ref, target, semiotic_info = line.rstrip('\n').split('\t') - try: - example = self.build_bert_example( - hyp, ref, target=target, span_info=semiotic_info, infer=infer - ) - except Exception as e: - logging.warning(str(e)) - logging.warning(line) - continue - if example is None: - logging.info("cannot create example: ") - logging.info(line) - continue - examples.append(example) - logging.info(f"Done. {len(examples)} examples converted.") - if infer: - return examples, hyps_refs - return examples diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py b/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py deleted file mode 100644 index 5898e6e83bdd..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py +++ /dev/null @@ -1,523 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pickle -from io import BytesIO -from typing import Dict, List, Optional, Tuple - -import braceexpand -import numpy as np -import torch -import webdataset as wds - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.core.classes.dataset import Dataset, IterableDataset -from nemo.core.neural_types import ChannelType, IntType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.distributed import webdataset_split_by_workers - -__all__ = [ - "SpellcheckingAsrCustomizationDataset", - "SpellcheckingAsrCustomizationTestDataset", - "TarredSpellcheckingAsrCustomizationDataset", -] - - -def collate_train_dataset( - batch: List[ - Tuple[ - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - ] - ], - pad_token_id: int, -) -> Tuple[ - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, -]: - """collate batch of training items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans). - pad_token_id: integer id of padding token (to use in padded_input_ids, padded_input_ids_for_subwords) - """ - max_length = 0 - max_length_for_subwords = 0 - max_length_for_spans = 1 # to avoid empty tensor - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) in batch: - if len(input_ids) > max_length: - max_length = len(input_ids) - if len(input_ids_for_subwords) > max_length_for_subwords: - max_length_for_subwords = len(input_ids_for_subwords) - if len(spans) > max_length_for_spans: - max_length_for_spans = len(spans) - - padded_input_ids = [] - padded_input_mask = [] - padded_segment_ids = [] - padded_input_ids_for_subwords = [] - padded_input_mask_for_subwords = [] - padded_segment_ids_for_subwords = [] - padded_character_pos_to_subword_pos = [] - padded_labels_mask = [] - padded_labels = [] - padded_spans = [] - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) in batch: - if len(input_ids) < max_length: - pad_length = max_length - len(input_ids) - padded_input_ids.append(np.pad(input_ids, pad_width=[0, pad_length], constant_values=pad_token_id)) - padded_input_mask.append(np.pad(input_mask, pad_width=[0, pad_length], constant_values=0)) - padded_segment_ids.append(np.pad(segment_ids, pad_width=[0, pad_length], constant_values=0)) - padded_labels_mask.append(np.pad(labels_mask, pad_width=[0, pad_length], constant_values=0)) - padded_labels.append(np.pad(labels, pad_width=[0, pad_length], constant_values=0)) - padded_character_pos_to_subword_pos.append( - np.pad(character_pos_to_subword_pos, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids.append(input_ids) - padded_input_mask.append(input_mask) - padded_segment_ids.append(segment_ids) - padded_labels_mask.append(labels_mask) - padded_labels.append(labels) - padded_character_pos_to_subword_pos.append(character_pos_to_subword_pos) - - if len(input_ids_for_subwords) < 
max_length_for_subwords: - pad_length = max_length_for_subwords - len(input_ids_for_subwords) - padded_input_ids_for_subwords.append( - np.pad(input_ids_for_subwords, pad_width=[0, pad_length], constant_values=pad_token_id) - ) - padded_input_mask_for_subwords.append( - np.pad(input_mask_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - padded_segment_ids_for_subwords.append( - np.pad(segment_ids_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids_for_subwords.append(input_ids_for_subwords) - padded_input_mask_for_subwords.append(input_mask_for_subwords) - padded_segment_ids_for_subwords.append(segment_ids_for_subwords) - - if len(spans) < max_length_for_spans: - padded_spans.append(np.ones((max_length_for_spans, 3), dtype=int) * -1) # pad value is [-1, -1, -1] - if len(spans) > 0: - padded_spans[-1][: spans.shape[0], : spans.shape[1]] = spans # copy actual spans to the beginning - else: - padded_spans.append(spans) - - return ( - torch.LongTensor(np.array(padded_input_ids)), - torch.LongTensor(np.array(padded_input_mask)), - torch.LongTensor(np.array(padded_segment_ids)), - torch.LongTensor(np.array(padded_input_ids_for_subwords)), - torch.LongTensor(np.array(padded_input_mask_for_subwords)), - torch.LongTensor(np.array(padded_segment_ids_for_subwords)), - torch.LongTensor(np.array(padded_character_pos_to_subword_pos)), - torch.LongTensor(np.array(padded_labels_mask)), - torch.LongTensor(np.array(padded_labels)), - torch.LongTensor(np.array(padded_spans)), - ) - - -def collate_test_dataset( - batch: List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]], - pad_token_id: int, -) -> Tuple[ - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, -]: - """collate batch of test items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, fragment_indices). 
- pad_token_id: integer id of padding token (to use in padded_input_ids, padded_input_ids_for_subwords) - """ - max_length = 0 - max_length_for_subwords = 0 - max_length_for_fragment_indices = 1 # to avoid empty tensor - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) in batch: - if len(input_ids) > max_length: - max_length = len(input_ids) - if len(input_ids_for_subwords) > max_length_for_subwords: - max_length_for_subwords = len(input_ids_for_subwords) - if len(fragment_indices) > max_length_for_fragment_indices: - max_length_for_fragment_indices = len(fragment_indices) - - padded_input_ids = [] - padded_input_mask = [] - padded_segment_ids = [] - padded_input_ids_for_subwords = [] - padded_input_mask_for_subwords = [] - padded_segment_ids_for_subwords = [] - padded_character_pos_to_subword_pos = [] - padded_fragment_indices = [] - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) in batch: - if len(input_ids) < max_length: - pad_length = max_length - len(input_ids) - padded_input_ids.append(np.pad(input_ids, pad_width=[0, pad_length], constant_values=pad_token_id)) - padded_input_mask.append(np.pad(input_mask, pad_width=[0, pad_length], constant_values=0)) - padded_segment_ids.append(np.pad(segment_ids, pad_width=[0, pad_length], constant_values=0)) - padded_character_pos_to_subword_pos.append( - np.pad(character_pos_to_subword_pos, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids.append(input_ids) - padded_input_mask.append(input_mask) - padded_segment_ids.append(segment_ids) - padded_character_pos_to_subword_pos.append(character_pos_to_subword_pos) - - if len(input_ids_for_subwords) < max_length_for_subwords: - pad_length = max_length_for_subwords - len(input_ids_for_subwords) - padded_input_ids_for_subwords.append( - np.pad(input_ids_for_subwords, pad_width=[0, pad_length], constant_values=pad_token_id) - ) - padded_input_mask_for_subwords.append( - np.pad(input_mask_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - padded_segment_ids_for_subwords.append( - np.pad(segment_ids_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids_for_subwords.append(input_ids_for_subwords) - padded_input_mask_for_subwords.append(input_mask_for_subwords) - padded_segment_ids_for_subwords.append(segment_ids_for_subwords) - - if len(fragment_indices) < max_length_for_fragment_indices: - # we use [0, 1, 0] as padding value for fragment_indices, it corresponds to [CLS] token, which is ignored and won't affect anything - p = np.zeros((max_length_for_fragment_indices, 3), dtype=int) - p[:, 1] = 1 - p[:, 2] = 0 - padded_fragment_indices.append(p) - if len(fragment_indices) > 0: - padded_fragment_indices[-1][ - : fragment_indices.shape[0], : fragment_indices.shape[1] - ] = fragment_indices # copy actual fragment_indices to the beginning - else: - padded_fragment_indices.append(fragment_indices) - - return ( - torch.LongTensor(np.array(padded_input_ids)), - torch.LongTensor(np.array(padded_input_mask)), - torch.LongTensor(np.array(padded_segment_ids)), - torch.LongTensor(np.array(padded_input_ids_for_subwords)), - torch.LongTensor(np.array(padded_input_mask_for_subwords)), - torch.LongTensor(np.array(padded_segment_ids_for_subwords)), - 
torch.LongTensor(np.array(padded_character_pos_to_subword_pos)), - torch.LongTensor(np.array(padded_fragment_indices)), - ) - - -class SpellcheckingAsrCustomizationDataset(Dataset): - """ - Dataset as used by the SpellcheckingAsrCustomizationModel for training and validation pipelines. - - Args: - input_file (str): path to tsv-file with data - example_builder: instance of BertExampleBuilder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "input_mask": NeuralType(('B', 'T'), MaskType()), - "segment_ids": NeuralType(('B', 'T'), ChannelType()), - "input_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "input_mask_for_subwords": NeuralType(('B', 'T'), MaskType()), - "segment_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "character_pos_to_subword_pos": NeuralType(('B', 'T'), ChannelType()), - "labels_mask": NeuralType(('B', 'T'), MaskType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "spans": NeuralType(('B', 'T', 'C'), IntType()), - } - - def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None: - self.example_builder = example_builder - self.examples = self.example_builder.read_input_file(input_file, infer=False) - self.pad_token_id = self.example_builder._pad_id - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - example = self.examples[idx] - input_ids = np.array(example.features["input_ids"], dtype=np.int16) - input_mask = np.array(example.features["input_mask"], dtype=np.int8) - segment_ids = np.array(example.features["segment_ids"], dtype=np.int8) - input_ids_for_subwords = np.array(example.features["input_ids_for_subwords"], dtype=np.int16) - input_mask_for_subwords = np.array(example.features["input_mask_for_subwords"], dtype=np.int8) - segment_ids_for_subwords = np.array(example.features["segment_ids_for_subwords"], dtype=np.int8) - character_pos_to_subword_pos = np.array(example.features["character_pos_to_subword_pos"], dtype=np.int16) - labels_mask = np.array(example.features["labels_mask"], dtype=np.int8) - labels = np.array(example.features["labels"], dtype=np.int8) - spans = np.array(example.features["spans"], dtype=np.int16) - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans). - """ - return collate_train_dataset(batch, pad_token_id=self.pad_token_id) - - -class TarredSpellcheckingAsrCustomizationDataset(IterableDataset): - """ - This Dataset loads training examples from tarred tokenized pickle files. - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - Additionally, please note that the len() of this DataLayer is assumed to be the number of tokens - of the text data. Shard strategy is scatter - each node gets a unique set of shards, which are permanently - pre-allocated and never changed at runtime. 
- Args: - text_tar_filepaths: a string (can be brace-expandable). - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 1. - pad_token_id: id of pad token (used in collate_fn) - """ - - def __init__( - self, - text_tar_filepaths: str, - shuffle_n: int = 1, - global_rank: int = 0, - world_size: int = 1, - pad_token_id: int = -1, # use real value or get error - ): - super(TarredSpellcheckingAsrCustomizationDataset, self).__init__() - if pad_token_id < 0: - raise ValueError("use non-negative pad_token_id: " + str(pad_token_id)) - - self.pad_token_id = pad_token_id - - # Replace '(', '[', '<' and '_OP_' with '{' - brace_keys_open = ['(', '[', '<', '_OP_'] - for bkey in brace_keys_open: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "{") - - # Replace ')', ']', '>' and '_CL_' with '}' - brace_keys_close = [')', ']', '>', '_CL_'] - for bkey in brace_keys_close: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "}") - - # Brace expand - text_tar_filepaths = list(braceexpand.braceexpand(text_tar_filepaths)) - - logging.info("Tarred dataset shards will be scattered evenly across all nodes.") - if len(text_tar_filepaths) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(text_tar_filepaths)}) is not divisible " - f"by number of distributed workers ({world_size}). " - f"Some shards will not be used ({len(text_tar_filepaths) % world_size})." - ) - begin_idx = (len(text_tar_filepaths) // world_size) * global_rank - end_idx = begin_idx + (len(text_tar_filepaths) // world_size) - logging.info('Begin Index : %d' % (begin_idx)) - logging.info('End Index : %d' % (end_idx)) - text_tar_filepaths = text_tar_filepaths[begin_idx:end_idx] - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - - self.tarpath = text_tar_filepaths - - # Put together WebDataset - self._dataset = wds.DataPipeline( - wds.SimpleShardList(urls=text_tar_filepaths), - webdataset_split_by_workers, - wds.shuffle(shuffle_n), - wds.tarfile_to_samples(), - wds.rename(pkl='pkl', key='__key__'), - wds.to_tuple('pkl', 'key'), - wds.map(self._build_sample), - ) - - def _build_sample(self, fname): - # Load file - pkl_file, _ = fname - pkl_file = BytesIO(pkl_file) - data = pickle.load(pkl_file) - pkl_file.close() - input_ids = data["input_ids"] - input_mask = data["input_mask"] - segment_ids = data["segment_ids"] - input_ids_for_subwords = data["input_ids_for_subwords"] - input_mask_for_subwords = data["input_mask_for_subwords"] - segment_ids_for_subwords = data["segment_ids_for_subwords"] - character_pos_to_subword_pos = data["character_pos_to_subword_pos"] - labels_mask = data["labels_mask"] - labels = data["labels"] - spans = data["spans"] - - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) - - def __iter__(self): - return self._dataset.__iter__() - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, 
character_pos_to_subword_pos, labels_mask, labels, spans). - """ - return collate_train_dataset(batch, pad_token_id=self.pad_token_id) - - -class SpellcheckingAsrCustomizationTestDataset(Dataset): - """ - Dataset for inference pipeline. - - Args: - sents: list of strings - example_builder: instance of BertExampleBuilder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "input_mask": NeuralType(('B', 'T'), MaskType()), - "segment_ids": NeuralType(('B', 'T'), ChannelType()), - "input_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "input_mask_for_subwords": NeuralType(('B', 'T'), MaskType()), - "segment_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "character_pos_to_subword_pos": NeuralType(('B', 'T'), ChannelType()), - "fragment_indices": NeuralType(('B', 'T', 'C'), IntType()), - } - - def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None: - self.example_builder = example_builder - self.examples, self.hyps_refs = self.example_builder.read_input_file(input_file, infer=True) - self.pad_token_id = self.example_builder._pad_id - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - example = self.examples[idx] - input_ids = np.array(example.features["input_ids"]) - input_mask = np.array(example.features["input_mask"]) - segment_ids = np.array(example.features["segment_ids"]) - input_ids_for_subwords = np.array(example.features["input_ids_for_subwords"]) - input_mask_for_subwords = np.array(example.features["input_mask_for_subwords"]) - segment_ids_for_subwords = np.array(example.features["segment_ids_for_subwords"]) - character_pos_to_subword_pos = np.array(example.features["character_pos_to_subword_pos"], dtype=np.int64) - fragment_indices = np.array(example.features["fragment_indices"], dtype=np.int16) - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos). - """ - return collate_test_dataset(batch, pad_token_id=self.pad_token_id) diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py b/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py deleted file mode 100644 index 7385f19b414a..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py +++ /dev/null @@ -1,929 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
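Both `collate_train_dataset` and `collate_test_dataset` above follow the same pattern: find the longest example per feature, right-pad ids with `pad_token_id` and masks/labels with zeros using `np.pad`, then stack the padded arrays into `LongTensor`s. The sketch below condenses that pattern to just two features; the toy batch and field names are assumptions for illustration, not the deleted code.

# Condensed sketch of the padding/stacking pattern used by the collate functions above.
import numpy as np
import torch

def pad_and_stack(batch, pad_token_id):
    """batch: list of (input_ids, input_mask) numpy arrays of varying length."""
    max_len = max(len(ids) for ids, _ in batch)
    padded_ids, padded_mask = [], []
    for ids, mask in batch:
        pad = max_len - len(ids)
        padded_ids.append(np.pad(ids, (0, pad), constant_values=pad_token_id))
        padded_mask.append(np.pad(mask, (0, pad), constant_values=0))
    return (
        torch.LongTensor(np.array(padded_ids)),
        torch.LongTensor(np.array(padded_mask)),
    )

batch = [(np.array([101, 7, 8, 102]), np.ones(4, dtype=np.int8)),
         (np.array([101, 9, 102]), np.ones(3, dtype=np.int8))]
ids, mask = pad_and_stack(batch, pad_token_id=0)
print(ids.shape, mask.shape)  # torch.Size([2, 4]) torch.Size([2, 4])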
- - -import json -import math -import random -import re -from collections import defaultdict, namedtuple -from typing import Dict, List, Set, Tuple, Union - -import numpy as np -from numba import jit - -"""Utility functions for Spellchecking ASR Customization.""" - - -def replace_diacritics(text): - text = re.sub(r"[éèëēêęěė]", "e", text) # latin - text = re.sub(r"[ё]", "е", text) # cyrillic - text = re.sub(r"[ãâāáäăàąåạảǎ]", "a", text) - text = re.sub(r"[úūüùưûů]", "u", text) - text = re.sub(r"[ôōóöõòőø]", "o", text) - text = re.sub(r"[ćçč]", "c", text) - text = re.sub(r"[ïīíîıì]", "i", text) - text = re.sub(r"[ñńňņ]", "n", text) - text = re.sub(r"[țťţ]", "t", text) - text = re.sub(r"[łľļ]", "l", text) - text = re.sub(r"[żžź]", "z", text) - text = re.sub(r"[ğ]", "g", text) - text = re.sub(r"[ďđ]", "d", text) - text = re.sub(r"[ķ]", "k", text) - text = re.sub(r"[ř]", "r", text) - text = re.sub(r"[ý]", "y", text) - text = re.sub(r"[æ]", "ae", text) - text = re.sub(r"[œ]", "oe", text) - text = re.sub(r"[șşšś]", "s", text) - return text - - -def load_ngram_mappings(input_name: str, max_misspelled_freq: int = 1000000000) -> Tuple[defaultdict, Set]: - """Loads n-gram mapping vocabularies in form required by dynamic programming - Args: - input_name: file with n-gram mappings - max_misspelled_freq: threshold on misspelled n-gram frequency - Returns: - vocab: dict {key=original_ngram, value=dict{key=misspelled_ngram, value=frequency}} - ban_ngram: set of banned misspelled n-grams - - Input format: - u t o u+i t o 49 8145 114 - u t o t e 63 8145 16970 - u t o o+_ t o 42 8145 1807 - """ - vocab = defaultdict(dict) - ban_ngram = set() - - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - orig, misspelled, joint_freq, orig_freq, misspelled_freq = line.strip().split("\t") - if orig == "" or misspelled == "": - raise ValueError("Empty n-gram: orig=" + orig + "; misspelled=" + misspelled) - misspelled = misspelled.replace("", "=") - if misspelled.replace("=", "").strip() == "": # skip if resulting ngram doesn't contain any real character - continue - if int(misspelled_freq) > max_misspelled_freq: - ban_ngram.add(misspelled + " ") # space at the end is required within get_index function - vocab[orig][misspelled] = int(joint_freq) / int(orig_freq) - return vocab, ban_ngram - - -def load_ngram_mappings_for_dp(input_name: str) -> Tuple[defaultdict, defaultdict, defaultdict, int]: - """Loads n-gram mapping vocabularies in form required by dynamic programming - Args: - input_name: file with n-gram mappings - Returns: - joint_vocab: dict where key=(original_ngram, misspelled_ngram), value=frequency - orig_vocab: dict where key=original_ngram, value=frequency - misspelled_vocab: dict where key=misspelled_ngram, value=frequency - max_len: maximum n-gram length seen in vocabulary - - Input format: original \t misspelled \t joint_freq \t original_freq \t misspelled_freq - u t o u+i t o 49 8145 114 - u t o t e 63 8145 16970 - u t o o+_ t o 42 8145 1807 - """ - joint_vocab = defaultdict(int) - orig_vocab = defaultdict(int) - misspelled_vocab = defaultdict(int) - max_len = 0 - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - orig, misspelled, joint_freq, _, _ = line.strip().split("\t") - if orig == "" or misspelled == "": - raise ValueError("Emty n-gram: orig=" + orig + "; misspelled=" + misspelled) - misspelled = misspelled.replace("", " ").replace("+", " ") - misspelled = " ".join(misspelled.split()) - if misspelled == "": # skip if resulting ngram doesn't contain any real 
character - continue - max_len = max(max_len, orig.count(" ") + 1, misspelled.count(" ") + 1) - joint_vocab[(orig, misspelled)] += int(joint_freq) - orig_vocab[orig] += int(joint_freq) - misspelled_vocab[misspelled] += int(joint_freq) - return joint_vocab, orig_vocab, misspelled_vocab, max_len - - -def get_alignment_by_dp( - ref_phrase: str, hyp_phrase: str, dp_data: Tuple[defaultdict, defaultdict, defaultdict, int] -) -> List[Tuple[str, str, float, float, int, int, int]]: - """Get best alignment path between a reference and (possibly) misspelled phrase using n-gram mappings vocabulary. - Args: - ref_phrase: candidate reference phrase (letters separated by space, real space replaced by underscore) - hyp_phrase: (possibly) misspelled phrase (letters separated by space, real space replaced by underscore) - dp_data: n-gram mapping vocabularies used by dynamic programming - Returns: - list of tuples (hyp_ngram, ref_ngram, logprob, sum_logprob, joint_freq, orig_freq, misspelled_freq) - This is best alignment path. - - Example: - ref_phrase: "a n h y d r i d e" - hyp_phrase: "a n d _ h y d r o d" - - Result: - [("*", "*", 0.0, 0.0, 0, 0, 0) - ("a n d _ h", "a n h", -2.34, -2.34, 226, 2338, 2203) - ("y d r o", "y d r i", -2.95, -5.29, 11, 211, 1584) - ("d", "d e", -1.99, -7.28, 60610, 444714, 2450334) - ] - Final path score is in path[-1][3]: -7.28 - Note that the order of ref_phrase and hyp_phrase matters, because n-gram mappings vocabulary is not symmetrical. - """ - joint_vocab, orig_vocab, misspelled_vocab, max_len = dp_data - hyp_letters = ["*"] + hyp_phrase.split() - ref_letters = ["*"] + ref_phrase.split() - DpInfo = namedtuple( - "DpInfo", ["hyp_pos", "ref_pos", "best_hyp_ngram_len", "best_ref_ngram_len", "score", "sum_score"] - ) - history = defaultdict(DpInfo) - history[(0, 0)] = DpInfo( - hyp_pos=0, ref_pos=0, best_hyp_ngram_len=1, best_ref_ngram_len=1, score=0.0, sum_score=0.0 - ) - for hyp_pos in range(len(hyp_letters)): - for ref_pos in range(len(ref_letters)): - if hyp_pos == 0 and ref_pos == 0: # cell (0, 0) is already defined - continue - # consider cell (hyp_pos, ref_pos) and find best path to get there - best_hyp_ngram_len = 0 - best_ref_ngram_len = 0 - best_ngram_score = float("-inf") - best_sum_score = float("-inf") - # loop over paths ending on non-empty ngram mapping - for hyp_ngram_len in range(1, 1 + min(max_len, hyp_pos + 1)): - hyp_ngram = " ".join(hyp_letters[(hyp_pos - hyp_ngram_len + 1) : (hyp_pos + 1)]) - for ref_ngram_len in range(1, 1 + min(max_len, ref_pos + 1)): - ref_ngram = " ".join(ref_letters[(ref_pos - ref_ngram_len + 1) : (ref_pos + 1)]) - if (ref_ngram, hyp_ngram) not in joint_vocab: - continue - joint_freq = joint_vocab[(ref_ngram, hyp_ngram)] - orig_freq = orig_vocab.get(ref_ngram, 1) - ngram_score = math.log(joint_freq / orig_freq) - previous_cell = (hyp_pos - hyp_ngram_len, ref_pos - ref_ngram_len) - if previous_cell not in history: - print("cell ", previous_cell, "does not exist") - continue - previous_score = history[previous_cell].sum_score - sum_score = ngram_score + previous_score - if sum_score > best_sum_score: - best_sum_score = sum_score - best_ngram_score = ngram_score - best_hyp_ngram_len = hyp_ngram_len - best_ref_ngram_len = ref_ngram_len - # loop over two variants with deletion of one character - deletion_score = -6.0 - insertion_score = -6.0 - if hyp_pos > 0: - previous_cell = (hyp_pos - 1, ref_pos) - previous_score = history[previous_cell].sum_score - sum_score = deletion_score + previous_score - if sum_score > best_sum_score: - 
best_sum_score = sum_score - best_ngram_score = deletion_score - best_hyp_ngram_len = 1 - best_ref_ngram_len = 0 - - if ref_pos > 0: - previous_cell = (hyp_pos, ref_pos - 1) - previous_score = history[previous_cell].sum_score - sum_score = insertion_score + previous_score - if sum_score > best_sum_score: - best_sum_score = sum_score - best_ngram_score = insertion_score - best_hyp_ngram_len = 0 - best_ref_ngram_len = 1 - - if best_hyp_ngram_len == 0 and best_ref_ngram_len == 0: - raise ValueError("best_hyp_ngram_len = 0 and best_ref_ngram_len = 0") - - # save cell to history - history[(hyp_pos, ref_pos)] = DpInfo( - hyp_pos=hyp_pos, - ref_pos=ref_pos, - best_hyp_ngram_len=best_hyp_ngram_len, - best_ref_ngram_len=best_ref_ngram_len, - score=best_ngram_score, - sum_score=best_sum_score, - ) - # now trace back on best path starting from last positions - path = [] - hyp_pos = len(hyp_letters) - 1 - ref_pos = len(ref_letters) - 1 - cell_info = history[(hyp_pos, ref_pos)] - path.append(cell_info) - while hyp_pos > 0 or ref_pos > 0: - hyp_pos -= cell_info.best_hyp_ngram_len - ref_pos -= cell_info.best_ref_ngram_len - cell_info = history[(hyp_pos, ref_pos)] - path.append(cell_info) - - result = [] - for info in reversed(path): - hyp_ngram = " ".join(hyp_letters[(info.hyp_pos - info.best_hyp_ngram_len + 1) : (info.hyp_pos + 1)]) - ref_ngram = " ".join(ref_letters[(info.ref_pos - info.best_ref_ngram_len + 1) : (info.ref_pos + 1)]) - joint_freq = joint_vocab.get((ref_ngram, hyp_ngram), 0) - orig_freq = orig_vocab.get(ref_ngram, 0) - misspelled_freq = misspelled_vocab.get(hyp_ngram, 0) - result.append((hyp_ngram, ref_ngram, info.score, info.sum_score, joint_freq, orig_freq, misspelled_freq)) - return result - - -def get_index( - custom_phrases: List[str], - vocab: defaultdict, - ban_ngram_global: Set[str], - min_log_prob: float = -4.0, - max_phrases_per_ngram: int = 100, -) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]: - """Given a restricted vocabulary of replacements, - loops through custom phrases, - generates all possible conversions and creates index. - - Args: - custom_phrases: list of all custom phrases, characters should be split by space, real space replaced to underscore. - vocab: n-gram mappings vocabulary - dict {key=original_ngram, value=dict{key=misspelled_ngram, value=frequency}} - ban_ngram_global: set of banned misspelled n-grams - min_log_prob: minimum log probability, after which we stop growing this n-gram. - max_phrases_per_ngram: maximum phrases that we allow to store per one n-gram. N-grams exceeding that quantity get banned. - - Returns: - phrases - list of phrases. Position in this list is used as phrase_id. - ngram2phrases - resulting index, i.e. 
dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - """ - - ban_ngram_local = set() # these ngrams are banned only for given custom_phrases - ngram_to_phrase_and_position = defaultdict(list) - - for custom_phrase in custom_phrases: - inputs = custom_phrase.split(" ") - begin = 0 - index_keys = [{} for _ in inputs] # key - letter ngram, index - beginning positions in phrase - - for begin in range(len(inputs)): - for end in range(begin + 1, min(len(inputs) + 1, begin + 5)): - inp = " ".join(inputs[begin:end]) - if inp not in vocab: - continue - for rep in vocab[inp]: - lp = math.log(vocab[inp][rep]) - - for b in range(max(0, end - 5), end): # try to grow previous ngrams with new replacement - new_ngrams = {} - for ngram in index_keys[b]: - lp_prev = index_keys[b][ngram] - if len(ngram) + len(rep) <= 10 and b + ngram.count(" ") == begin: - if lp_prev + lp > min_log_prob: - new_ngrams[ngram + rep + " "] = lp_prev + lp - index_keys[b].update(new_ngrams) # join two dictionaries - # add current replacement as ngram - if lp > min_log_prob: - index_keys[begin][rep + " "] = lp - - for b in range(len(index_keys)): - for ngram, lp in sorted(index_keys[b].items(), key=lambda item: item[1], reverse=True): - if ngram in ban_ngram_global: # here ngram ends with a space - continue - real_length = ngram.count(" ") - ngram = ngram.replace("+", " ").replace("=", " ") - ngram = " ".join(ngram.split()) # here ngram doesn't end with a space anymore - if ngram + " " in ban_ngram_global: # this can happen after deletion of + and = - continue - if ngram in ban_ngram_local: - continue - ngram_to_phrase_and_position[ngram].append((custom_phrase, b, real_length, lp)) - if len(ngram_to_phrase_and_position[ngram]) > max_phrases_per_ngram: - ban_ngram_local.add(ngram) - del ngram_to_phrase_and_position[ngram] - continue - - phrases = [] # id to phrase - phrase2id = {} # phrase to id - ngram2phrases = defaultdict(list) # ngram to list of tuples (phrase_id, begin, length, logprob) - - for ngram in ngram_to_phrase_and_position: - for phrase, b, length, lp in ngram_to_phrase_and_position[ngram]: - if phrase not in phrase2id: - phrases.append(phrase) - phrase2id[phrase] = len(phrases) - 1 - ngram2phrases[ngram].append((phrase2id[phrase], b, length, lp)) - - return phrases, ngram2phrases - - -def load_index(input_name: str) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]: - """ Load index from file - Args: - input_name: file with index - Returns: - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. 
- ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - """ - phrases = [] # id to phrase - phrase2id = {} # phrase to id - ngram2phrases = defaultdict(list) # ngram to list of tuples (phrase_id, begin_pos, size, logprob) - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - ngram, phrase, b, size, lp = line.split("\t") - b = int(b) - size = int(size) - lp = float(lp) - if phrase not in phrase2id: - phrases.append(phrase) - phrase2id[phrase] = len(phrases) - 1 - ngram2phrases[ngram].append((phrase2id[phrase], b, size, lp)) - return phrases, ngram2phrases - - -def search_in_index( - ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]], phrases: List[str], letters: Union[str, List[str]] -) -> Tuple[np.ndarray, List[Set[str]]]: - """ Function used to search in index - - Args: - ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores. - - Returns: - phrases2positions: a matrix of size (len(phrases), len(letters)). - It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere. - It is used later to find phrases with many hits within a contiguous window - potential matching candidates. - position2ngrams: positions in ASR-hypothesis mapped to sets of ngrams starting from that position. - It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered). - """ - - if " " in letters: - raise ValueError("letters should not contain space: " + str(letters)) - - phrases2positions = np.zeros((len(phrases), len(letters)), dtype=float) - # positions mapped to sets of ngrams starting from that position - position2ngrams = [set() for _ in range(len(letters))] - - begin = 0 - for begin in range(len(letters)): - for end in range(begin + 1, min(len(letters) + 1, begin + 7)): - ngram = " ".join(letters[begin:end]) - if ngram not in ngram2phrases: - continue - for phrase_id, b, size, lp in ngram2phrases[ngram]: - phrases2positions[phrase_id, begin:end] = 1.0 - position2ngrams[begin].add(ngram) - return phrases2positions, position2ngrams - - -@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit -def get_all_candidates_coverage(phrases, phrases2positions): - """Get maximum hit coverage for each phrase - within a moving window of length of the phrase. - Args: - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - phrases2positions: a matrix of size (len(phrases), len(ASR-hypothesis)). - It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere. - Returns: - candidate2coverage: list of size len(phrases) containing coverage (0.0 to 1.0) in best window. - candidate2position: list of size len(phrases) containing starting position of best window. 
- """ - candidate2coverage = [0.0] * len(phrases) - candidate2position = [-1] * len(phrases) - - for i in range(len(phrases)): - phrase_length = phrases[i].count(" ") + 1 - all_coverage = np.sum(phrases2positions[i]) / phrase_length - # if total coverage on whole ASR-hypothesis is too small, there is no sense in using moving window - if all_coverage < 0.4: - continue - moving_sum = np.sum(phrases2positions[i, 0:phrase_length]) - max_sum = moving_sum - best_pos = 0 - for pos in range(1, phrases2positions.shape[1] - phrase_length + 1): - moving_sum -= phrases2positions[i, pos - 1] - moving_sum += phrases2positions[i, pos + phrase_length - 1] - if moving_sum > max_sum: - max_sum = moving_sum - best_pos = pos - - coverage = max_sum / (phrase_length + 2) # smoothing - candidate2coverage[i] = coverage - candidate2position[i] = best_pos - return candidate2coverage, candidate2position - - -def get_candidates( - ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]], - phrases: List[str], - letters: Union[str, List[str]], - pool_for_random_candidates: List[str], - min_phrase_coverage: float = 0.8, -) -> List[Tuple[str, int, int, float, float]]: - """Given an index of custom vocabulary and an ASR-hypothesis retrieve 10 candidates. - Args: - ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores. - pool_for_random_candidates: large list of strings, from which to sample random candidates in case when there are less than 10 real candidates - min_phrase_coverage: We discard candidates which are not covered by n-grams to at least to this extent - (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered). - Returns: - candidates: list of tuples (candidate_text, approximate_begin_position, length, coverage of window in ASR-hypothesis, coverage of phrase itself). 
- """ - phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, letters) - candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions) - - # mask for each custom phrase, how many which symbols are covered by input ngrams - phrases2coveredsymbols = [[0 for x in phrases[i].split(" ")] for i in range(len(phrases))] - candidates = [] - k = 0 - for idx, coverage in sorted(enumerate(candidate2coverage), key=lambda item: item[1], reverse=True): - begin = candidate2position[idx] # this is most likely beginning of this candidate - phrase_length = phrases[idx].count(" ") + 1 - for pos in range(begin, begin + phrase_length): - # we do not know exact end of custom phrase in text, it can be different from phrase length - if pos >= len(position2ngrams): - break - for ngram in position2ngrams[pos]: - for phrase_id, b, size, lp in ngram2phrases[ngram]: - if phrase_id != idx: - continue - for ppos in range(b, b + size): - if ppos >= phrase_length: - break - phrases2coveredsymbols[phrase_id][ppos] = 1 - k += 1 - if k > 100: - break - real_coverage = sum(phrases2coveredsymbols[idx]) / len(phrases2coveredsymbols[idx]) - if real_coverage < min_phrase_coverage: - continue - candidates.append((phrases[idx], begin, phrase_length, coverage, real_coverage)) - - # no need to process this sentence further if it does not contain any real candidates - if len(candidates) == 0: - print("WARNING: no real candidates", candidates) - return [] - - while len(candidates) < 10: - dummy = random.choice(pool_for_random_candidates) - dummy = " ".join(list(dummy.replace(" ", "_"))) - candidates.append((dummy, -1, dummy.count(" ") + 1, 0.0, 0.0)) - - candidates = candidates[:10] - random.shuffle(candidates) - if len(candidates) != 10: - print("WARNING: cannot get 10 candidates", candidates) - return [] - - return candidates - - -def read_spellmapper_predictions(filename: str) -> List[Tuple[str, List[Tuple[int, int, str, float]], List[int]]]: - """Read results of SpellMapper inference from file. 
- Args: - filename: file with SpellMapper results - Returns: - list of tuples (sent, list of fragment predictions, list of letter predictions) - One fragment prediction is a tuple (begin, end, replacement_text, prob) - """ - results = [] - with open(filename, "r", encoding="utf-8") as f: - for line in f: - text, candidate_str, fragment_predictions_str, letter_predictions_str = line.strip().split("\t") - text = text.replace(" ", "").replace("_", " ") - candidate_str = candidate_str.replace(" ", "").replace("_", " ") - candidates = candidate_str.split(";") - letter_predictions = list(map(int, letter_predictions_str.split())) - if len(candidates) != 10: - raise IndexError("expect 10 candidates, got: ", len(candidates)) - if len(text) != len(letter_predictions): - raise IndexError("len(text)=", len(text), "; len(letter_predictions)=", len(letter_predictions)) - replacements = [] - if fragment_predictions_str != "": - for prediction in fragment_predictions_str.split(";"): - begin, end, candidate_id, prob = prediction.split(" ") - begin = int(begin) - end = int(end) - candidate_id = int(candidate_id) - prob = float(prob) - replacements.append((begin, end, candidates[candidate_id - 1], prob)) - replacements.sort() # it will sort by begin, then by end - results.append((text, replacements, letter_predictions)) - return results - - -def substitute_replacements_in_text( - text: str, replacements: List[Tuple[int, int, str, float]], replace_hyphen_to_space: bool -) -> str: - """Substitute replacements to the input text, iterating from end to beginning, so that indexing does not change. - Note that we expect intersecting replacements to be already filtered. - Args: - text: sentence; - replacements: list of replacements, each is a tuple (begin, end, text, probability); - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - Returns: - corrected sentence - """ - replacements.sort() - last_begin = len(text) + 1 - corrected_text = text - for begin, end, candidate, prob in reversed(replacements): - if end > last_begin: - print("WARNING: skip intersecting replacement [", candidate, "] in text: ", text) - continue - if replace_hyphen_to_space: - candidate = candidate.replace("-", " ") - corrected_text = corrected_text[:begin] + candidate + corrected_text[end:] - last_begin = begin - return corrected_text - - -def apply_replacements_to_text( - text: str, - replacements: List[Tuple[int, int, str, float]], - min_prob: float = 0.5, - replace_hyphen_to_space: bool = False, - dp_data: Tuple[defaultdict, defaultdict, defaultdict, int] = None, - min_dp_score_per_symbol: float = -99.9, -) -> str: - """Filter and apply replacements to the input sentence. 
- Args: - text: input sentence; - replacements: list of proposed replacements (probably intersecting), each is a tuple (begin, end, text, probability); - min_prob: threshold on replacement probability; - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - dp_data: n-gram mapping vocabularies used by dynamic programming, if None - dynamic programming is not used; - min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length - Returns: - corrected sentence - """ - # sort replacements by positions - replacements.sort() - # filter replacements - # Note that we do not skip replacements with same text, otherwise intersecting candidates with lower probability can win - filtered_replacements = [] - for j in range(len(replacements)): - replacement = replacements[j] - begin, end, candidate, prob = replacement - fragment = text[begin:end] - candidate_spaced = " ".join(list(candidate.replace(" ", "_"))) - fragment_spaced = " ".join(list(fragment.replace(" ", "_"))) - # apply penalty if candidate length is bigger than fragment length - # to avoid cases like "forward-looking" replacing "looking" in "forward looking" resulting in "forward forward looking" - if len(candidate) > len(fragment): - penalty = len(fragment) / len(candidate) - prob *= penalty - # skip replacement with low probability - if prob < min_prob: - continue - # skip replacements with some predefined templates, e.g. "*'s" => "*s" - if check_banned_replacements(fragment, candidate): - continue - if dp_data is not None: - path = get_alignment_by_dp(candidate_spaced, fragment_spaced, dp_data) - # path[-1][3] is the sum of logprobs for best path of dynamic programming: divide sum_score by length - if path[-1][3] / (len(fragment)) < min_dp_score_per_symbol: - continue - - # skip replacement if it intersects with previous replacement and has lower probability, otherwise remove previous replacement - if len(filtered_replacements) > 0 and filtered_replacements[-1][1] > begin: - if filtered_replacements[-1][3] > prob: - continue - else: - filtered_replacements.pop() - filtered_replacements.append((begin, end, candidate, prob)) - - return substitute_replacements_in_text(text, filtered_replacements, replace_hyphen_to_space) - - -def update_manifest_with_spellmapper_corrections( - input_manifest_name: str, - short2full_name: str, - output_manifest_name: str, - spellmapper_results_name: str, - min_prob: float = 0.5, - replace_hyphen_to_space: bool = True, - field_name: str = "pred_text", - use_dp: bool = True, - ngram_mappings: Union[str, None] = None, - min_dp_score_per_symbol: float = -1.5, -) -> None: - """Post-process SpellMapper predictions and write corrected sentence to the specified field of nemo manifest. - The previous content of this field will be copied to "*_before_correction" field. - If the sentence was split into fragments before running SpellMapper, all replacements will be first gathered together and then applied to the original long sentence. 
- Args: - input_manifest_name: input nemo manifest; - short2full_name: text file with two columns: short_sent \t full_sent; - output_manifest_name: output nemo manifest; - spellmapper_results_name: text file with SpellMapper inference results; - min_prob: threshold on replacement probability; - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - field_name: name of json field whose text we want to correct; - use_dp: bool = If True, additional replacement filtering will be applied using dynamic programming (works slow); - ngram_mappings: file with n-gram mappings, only needed if use_dp=True - min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length - """ - short2full_sent = defaultdict(list) - sent2corrections = defaultdict(dict) - with open(short2full_name, "r", encoding="utf-8") as f: - for line in f: - s = line.strip() - short_sent, full_sent = s.split("\t") - short2full_sent[short_sent].append(full_sent) - sent2corrections[full_sent] = [] - - spellmapper_results = read_spellmapper_predictions(spellmapper_results_name) - dp_data = None - if use_dp: - dp_data = load_ngram_mappings_for_dp(ngram_mappings) - - for text, replacements, _ in spellmapper_results: - short_sent = text - if short_sent not in short2full_sent: - continue - # it can happen that one short sentence occurred in multiple full sentences - for full_sent in short2full_sent[short_sent]: - offset = full_sent.find(short_sent) - for begin, end, candidate, prob in replacements: - sent2corrections[full_sent].append((begin + offset, end + offset, candidate, prob)) - - out = open(output_manifest_name, "w", encoding="utf-8") - with open(input_manifest_name, "r", encoding="utf-8") as f: - for line in f: - record = json.loads(line.strip()) - sent = record[field_name] - record[field_name + "_before_correction"] = record[field_name] - if sent in sent2corrections: - record[field_name] = apply_replacements_to_text( - sent, - sent2corrections[sent], - min_prob=min_prob, - replace_hyphen_to_space=replace_hyphen_to_space, - dp_data=dp_data, - min_dp_score_per_symbol=min_dp_score_per_symbol, - ) - out.write(json.dumps(record) + "\n") - out.close() - - -def extract_and_split_text_from_manifest( - input_name: str, output_name: str, field_name: str = "pred_text", len_in_words: int = 16, step_in_words: int = 8 -) -> None: - """Extract text of the specified field in nemo manifest and split it into fragments (possibly with intersection). - The result is saved to a text file with two columns: short_sent \t full_sent. - This is useful if we want to process shorter sentences and then apply the results to the original long sentence. - Args: - input_name: input nemo manifest, - output_name: output text file, - field_name: name of json field from which we extract the sentence text, - len_in_words: maximum number of words in a fragment, - step_in_words: on how many words we move at each step. - For example, if the len_in_words=16 and step_in_words=8 the fragments will be intersected by half. 
- """ - short2full_sent = set() - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - record = json.loads(line.strip()) - sent = record[field_name] - if " " in sent: - raise ValueError("found multiple space in: " + sent) - words = sent.split() - for i in range(0, len(words), step_in_words): - short_sent = " ".join(words[i : i + len_in_words]) - short2full_sent.add((short_sent, sent)) - - with open(output_name, "w", encoding="utf-8") as out: - for short_sent, full_sent in short2full_sent: - out.write(short_sent + "\t" + full_sent + "\n") - - -def check_banned_replacements(src: str, dst: str) -> bool: - """This function is used to check is a pair of words/phrases is matching some common template that we don't want to replace with one another. - Args: - src: first phrase - dst: second phrase - Returns True if this replacement should be banned. - """ - # customers' => customer's - if src.endswith("s'") and dst.endswith("'s") and src[0:-2] == dst[0:-2]: - return True - # customer's => customers' - if src.endswith("'s") and dst.endswith("s'") and src[0:-2] == dst[0:-2]: - return True - # customers => customer's - if src.endswith("s") and dst.endswith("'s") and src[0:-1] == dst[0:-2]: - return True - # customer's => customers - if src.endswith("'s") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # customers => customers' - if src.endswith("s") and dst.endswith("s'") and src[0:-1] == dst[0:-2]: - return True - # customers' => customers - if src.endswith("s'") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # utilities => utility's - if src.endswith("ies") and dst.endswith("y's") and src[0:-3] == dst[0:-3]: - return True - # utility's => utilities - if src.endswith("y's") and dst.endswith("ies") and src[0:-3] == dst[0:-3]: - return True - # utilities => utility - if src.endswith("ies") and dst.endswith("y") and src[0:-3] == dst[0:-1]: - return True - # utility => utilities - if src.endswith("y") and dst.endswith("ies") and src[0:-1] == dst[0:-3]: - return True - # group is => group's - if src.endswith(" is") and dst.endswith("'s") and src[0:-3] == dst[0:-2]: - return True - # group's => group is - if src.endswith("'s") and dst.endswith(" is") and src[0:-2] == dst[0:-3]: - return True - # trex's => trex - if src.endswith("'s") and src[0:-2] == dst: - return True - # trex => trex's - if dst.endswith("'s") and dst[0:-2] == src: - return True - # increases => increase (but trimass => trimas is ok) - if src.endswith("s") and (not src.endswith("ss")) and src[0:-1] == dst: - return True - # increase => increases ((but trimas => trimass is ok)) - if dst.endswith("s") and (not dst.endswith("ss")) and dst[0:-1] == src: - return True - # anticipate => anticipated - if src.endswith("e") and dst.endswith("ed") and src[0:-1] == dst[0:-2]: - return True - # anticipated => anticipate - if src.endswith("ed") and dst.endswith("e") and src[0:-2] == dst[0:-1]: - return True - # blocks => blocked - if src.endswith("s") and dst.endswith("ed") and src[0:-1] == dst[0:-2]: - return True - # blocked => blocks - if src.endswith("ed") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # lives => lived - if src.endswith("es") and dst.endswith("ed") and src[0:-2] == dst[0:-2]: - return True - # lived => lives - if src.endswith("ed") and dst.endswith("es") and src[0:-2] == dst[0:-2]: - return True - # regarded => regard - if src.endswith("ed") and src[0:-2] == dst: - return True - # regard => regarded - if dst.endswith("ed") and dst[0:-2] == src: - return True 
- # regardeding => regard - if src.endswith("ing") and src[0:-3] == dst: - return True - # regard => regarding - if dst.endswith("ing") and dst[0:-3] == src: - return True - # longer => long - if src.endswith("er") and src[0:-2] == dst: - return True - # long => longer - if dst.endswith("er") and dst[0:-2] == src: - return True - # discussed => discussing - if src.endswith("ed") and dst.endswith("ing") and src[0:-2] == dst[0:-3]: - return True - # discussing => discussed - if src.endswith("ing") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: - return True - # live => living - if src.endswith("e") and dst.endswith("ing") and src[0:-1] == dst[0:-3]: - return True - # living => live - if src.endswith("ing") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # discussion => discussing - if src.endswith("ion") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: - return True - # discussing => discussion - if src.endswith("ing") and dst.endswith("ion") and src[0:-3] == dst[0:-3]: - return True - # alignment => aligning - if src.endswith("ment") and dst.endswith("ing") and src[0:-4] == dst[0:-3]: - return True - # aligning => alignment - if src.endswith("ing") and dst.endswith("ment") and src[0:-3] == dst[0:-4]: - return True - # dispensers => dispensing - if src.endswith("ers") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: - return True - # dispensing => dispensers - if src.endswith("ing") and dst.endswith("ers") and src[0:-3] == dst[0:-3]: - return True - # integrate => integrity - if src.endswith("ate") and dst.endswith("ity") and src[0:-3] == dst[0:-3]: - return True - # integrity => integrate - if src.endswith("ity") and dst.endswith("ate") and src[0:-3] == dst[0:-3]: - return True - # discussion => discussed - if src.endswith("ion") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: - return True - # discussed => discussion - if src.endswith("ed") and dst.endswith("ion") and src[0:-2] == dst[0:-3]: - return True - # anticipation => anticipate - if src.endswith("ion") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # anticipate => anticipation - if src.endswith("e") and dst.endswith("ion") and src[0:-1] == dst[0:-3]: - return True - # incremental => increment - if src.endswith("ntal") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: - return True - # increment => incremental - if src.endswith("nt") and dst.endswith("ntal") and src[0:-2] == dst[0:-4]: - return True - # national => nation - if src.endswith("nal") and dst.endswith("n") and src[0:-3] == dst[0:-1]: - return True - # nation => national - if src.endswith("n") and dst.endswith("nal") and src[0:-1] == dst[0:-3]: - return True - # significantly => significant - if src.endswith("ntly") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: - return True - # significant => significantly - if src.endswith("nt") and dst.endswith("ntly") and src[0:-2] == dst[0:-4]: - return True - # delivery => deliverer - if src.endswith("ery") and dst.endswith("erer") and src[0:-3] == dst[0:-4]: - return True - # deliverer => delivery - if src.endswith("erer") and dst.endswith("ery") and src[0:-4] == dst[0:-3]: - return True - # deliver => deliverer - if src.endswith("er") and dst.endswith("erer") and src[0:-2] == dst[0:-4]: - return True - # deliverer => deliver - if src.endswith("erer") and dst.endswith("er") and src[0:-4] == dst[0:-2]: - return True - # comparably => comparable - if src.endswith("bly") and dst.endswith("ble") and src[0:-3] == dst[0:-3]: - return True - # comparable => comparably - if 
src.endswith("ble") and dst.endswith("bly") and src[0:-3] == dst[0:-3]: - return True - # comparably => comparability - if src.endswith("bly") and dst.endswith("bility") and src[0:-3] == dst[0:-6]: - return True - # comparability => comparably - if src.endswith("bility") and dst.endswith("bly") and src[0:-6] == dst[0:-3]: - return True - # beautiful => beautifully - if src.endswith("l") and dst.endswith("lly") and src[0:-1] == dst[0:-3]: - return True - # beautifully => beautiful - if src.endswith("lly") and dst.endswith("l") and src[0:-3] == dst[0:-1]: - return True - # active => actively - if src.endswith("e") and dst.endswith("ely") and src[0:-1] == dst[0:-3]: - return True - # actively => active - if src.endswith("ely") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # america => american - if src.endswith("a") and dst.endswith("an") and src[0:-1] == dst[0:-2]: - return True - # american => america - if src.endswith("an") and dst.endswith("a") and src[0:-2] == dst[0:-1]: - return True - # reinvesting => investing - if src.startswith("re") and src[2:] == dst: - return True - # investing => reinvesting - if dst.startswith("re") and dst[2:] == src: - return True - # unchanged => changed - if src.startswith("un") and src[2:] == dst: - return True - # changed => unchanged - if dst.startswith("un") and dst[2:] == src: - return True - # disrespected => respected - if src.startswith("dis") and src[3:] == dst: - return True - # respected => disrespected - if dst.startswith("dis") and dst[3:] == src: - return True - # outperformance => performance - if src.startswith("out") and src[3:] == dst: - return True - # performance => outperformance - if dst.startswith("out") and dst[3:] == src: - return True - return False diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py deleted file mode 100644 index d82ee36a8833..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py +++ /dev/null @@ -1,2000 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -__all__ = [ - 'BertPunctuationCapitalizationDataset', - 'LABEL_ID_DIR_FOR_NEMO_CHECKPOINT', - 'Progress', - 'PunctuationCapitalizationEvalDataConfig', - 'PunctuationCapitalizationTrainDataConfig', - 'create_label_ids', - 'create_masks_and_segment_ids', - 'is_legacy_data_config', - 'legacy_data_config_to_new_data_config', - 'load_label_ids', - 'raise_not_equal_labels_error', - 'save_label_ids', -] - -import itertools -import multiprocessing as mp -import os -import pickle -import tempfile -from dataclasses import dataclass -from math import ceil -from pathlib import Path -from queue import Empty -from time import sleep -from typing import Any, Dict, List, Optional, Set, Tuple, Union - -import numpy as np -import torch -from numpy import ndarray -from omegaconf import MISSING, DictConfig, OmegaConf -from torch.nn.utils.rnn import pad_sequence -from tqdm import tqdm - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import get_label_stats, get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import AudioSignal, ChannelType, LabelsType, LengthsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -try: - from nemo.collections.asr.parts.preprocessing import AudioSegment - - ASR_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False - - -MAX_NUM_QUERIES_IN_SPLIT = 10 ** 4 -TOKENIZATION_PROGRESS_REPORT_PERIOD = 10 ** 3 -BATCH_MARK_UP_PROGRESS_REPORT_PERIOD = 10 ** 4 -BATCH_BUILDING_PROGRESS_REPORT_PERIOD = 10 ** 4 - -LABEL_ID_DIR_FOR_NEMO_CHECKPOINT = "label_id_files_for_nemo_checkpoint" - - -@dataclass -class PunctuationCapitalizationDataConfigBase: - """A base class for punctuation and capitalization data configs. This class does not define ``ds_item`` - attribute which works differently for train and evaluation data.""" - - ################################################### - # AUDIO DATASET PARAMETERS - ################################################### - use_audio: bool = False - """ - Whether to use audio or not. If set to True you should provide ``audio_file``. - """ - - audio_file: Optional[str] = None - """ - Path to the file with audio paths one per row. - """ - - sample_rate: Optional[int] = 16000 - """ - Sample rate of audios to use. - """ - - use_bucketing: Optional[bool] = True - """ - Whether to pack samples into ``tokens_in_batch`` or not. Increases GPU utilization but may cause significant RAM consumption if used together with ``use_audio``. - """ - - batch_size: Optional[int] = 32 - """ - Batch size used if ``use_bucketing`` set to False. - """ - - preload_audios: Optional[bool] = True - """ - If set to True audios will be loaded during ``__init__`` call of dataset. Otherwise it will be loaded during ``collate_fn ``call - """ - - ################################################### - # PARAMETERS COMMON FOR REGULAR AND TARRED DATASETS - ################################################### - use_tarred_dataset: bool = MISSING - """Whether to use tarred dataset. If True, then you should provide ``tar_metadata_file``. Otherwise, you should - provide ``text_file``, ``labels_file``, ``tokens_in_batch``.""" - - label_info_save_dir: Optional[str] = None - """A path to a directory where files created during dataset processing are stored. These files include label id - files and label stats files. By default, it is a directory containing ``text_file`` or ``tar_metadata_file``. 
- You may need this parameter if dataset directory is read-only and thus does not allow saving anything near dataset - files""" - - ################################################# - # REGULAR DATASET PARAMETERS - ################################################# - text_file: Optional[str] = None - """A path to a file with source text data without punctuation and capitalization.""" - - labels_file: Optional[str] = None - """A path to a file with punctuation and capitalization labels in NeMo format. NeMo format is described in - `documentation - `_ - """ - - tokens_in_batch: Optional[int] = None - """Number of tokens in a batch including paddings and special tokens ([CLS], [SEP], [UNK]). This config does - not have ``batch_size`` parameter.""" - - max_seq_length: int = 512 - """Max number of tokens in a source sequence. ``max_seq_length`` includes [CLS] and [SEP] tokens. Sequences - which are too long will be clipped by removal of tokens from the end of a sequence.""" - - num_samples: int = -1 - """A number of samples loaded from ``text_file`` and ``labels_file`` which are used in the dataset. If this - parameter equals ``-1``, then all samples are used.""" - - use_cache: bool = True - """Whether to use pickled features. If pickled features file does not exist or ``use_cache=False``, then features - are pickled in ``cache_dir``. Pickled features include input ids, subtokens mask (mask of first tokens in words), - encoded punctuation and capitalization labels, label ids. Features creation consumes considerable time and this - ``use_cache=True`` significantly speeds up training starting. Pickled features are also used for sharing features - between processes if data parallel training is used.""" - - cache_dir: Optional[str] = None - """A path to a directory containing cache or directory where newly created cache is saved. By default, it is - a directory containing ``text_file``. You may need this parameter if cache for a dataset is going to be created - and the dataset directory is read-only. - - ``cache_dir`` and ``label_info_save_dir`` are separate parameters for the case when a cache is ready and this cache - is stored in a read only directory. In this case you will separate ``label_info_save_dir``.""" - - get_label_frequences: bool = False - """Whether to show and save label frequencies. Frequencies are showed if ``verbose`` parameter is ``True``. If - ``get_label_frequencies=True``, then frequencies are saved into ``label_info_save_dir``""" - - verbose: bool = True - """If ``True`` dataset instance will print progress messages and examples of acquired features.""" - - n_jobs: Optional[int] = 0 - """Number of workers used for features creation (tokenization, label encoding, and clipping). If 0, then - multiprocessing is not used; if ``None``, then n_jobs is equal to the number of CPU cores. - There can be weird deadlocking errors with some tokenizers (e.g. SentencePiece) if ``n_jobs`` is greater than zero. - """ - - ################################################# - # TARRED DATASET PARAMETERS - ################################################# - tar_metadata_file: Optional[str] = None - """A path to tarred dataset metadata file. Tarred metadata file and other parts of tarred dataset are usually - created by the script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_ - """ - - tar_shuffle_n: int = 1 - """The size of shuffle buffer of `webdataset`. 
The number of batches which are permuted.""" - - shard_strategy: Optional[str] = 'scatter' - """Tarred dataset shard distribution strategy chosen as a str value during ddp. Accepted values are `scatter` and `replicate`. - `scatter`: The default shard strategy applied by WebDataset, where each node gets a unique set of shards, which are permanently - pre-allocated and never changed at runtime. `replicate` is an optional shard strategy, where each node gets the entire set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. The benefit of replication is that - it allows each node to sample data points from the entire dataset independently of other nodes, and reduces dependence on value of - ``tar_shuffle_n``. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tar files, and therefore more than one node may sample - the same tarfile, and even sample the same data points! As such, there is no assured guarantee that all samples in the dataset - will be sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific occasions (when the number of - shards is not divisible with ``world_size``), will not sample the entire dataset. For these reasons it is not advisable to use - tarred datasets as validation or test datasets. - """ - - ################################################# - # PYTORCH DATALOADER PARAMETERS - ################################################# - shuffle: bool = True - """Shuffle batches every epoch. For regular training datasets, the parameter also activates batch repacking every - epoch. For tarred dataset, it would be only batches permutation.""" - - drop_last: bool = False - """In cases when data parallelism is used, ``drop_last`` defines the way data pipeline behaves when some replicas - are out of data and some are not. If ``drop_last`` is ``True``, then epoch ends in the moment when any replica runs - out of data. If ``drop_last`` is ``False``, then the replica will replace missing batch with a batch from a pool of - batches that the replica has already processed. If data parallelism is not used, then parameter ``drop_last`` does - not do anything. For more information see ``torch.utils.data.distributed.DistributedSampler``""" - - pin_memory: bool = True - """See ``torch.utils.data.DataLoader`` documentation.""" - - num_workers: int = 8 - """See ``torch.utils.data.DataLoader`` documentation.""" - - persistent_workers: bool = True - """See ``torch.utils.data.DataLoader`` documentation.""" - - -@dataclass -class PunctuationCapitalizationTrainDataConfig(PunctuationCapitalizationDataConfigBase): - ds_item: Optional[str] = MISSING - """Path to a directory where `tar_metadata_file` or `text_file` and `labels_file` lay.""" - - -@dataclass -class PunctuationCapitalizationEvalDataConfig(PunctuationCapitalizationDataConfigBase): - ds_item: Optional[Any] = MISSING - """Path to a directory where `tar_metadata_file` or `text_file` and `labels_file` lay. ``Any`` = ``str`` or - ``List[str]``. If a ``List[str]``, then the model is tested or validated on several datasets.""" - - -def is_legacy_data_config(ds_section: DictConfig) -> bool: - return 'use_tarred_dataset' not in ds_section - - -def legacy_data_config_to_new_data_config( - ds_section: DictConfig, legacy_dataset_section: DictConfig, train: bool -) -> DictConfig: - """ - Transform old style dataset to new format dataset. 
- Args: - ds_section: a ds section (``train_ds``, or ``validation_ds``, or ``test_ds``) from old style config. Such - section contain ``batch_size`` parameter. - legacy_dataset_section: a ``model.dataset`` section. ``model.dataset`` section contains ``data_dir`` parameter - train: ``True`` if ``train_ds`` is transformed and ``False`` otherwise - - Returns: - New format dataset based on either ``PunctuationCapitalizationTrainDataConfig`` (``train=True``) or - ``PunctuationCapitalizationEvalDataConfig`` (``train=False``) - """ - if train: - cls = PunctuationCapitalizationTrainDataConfig - ds_item = legacy_dataset_section.get('data_dir') - else: - cls = PunctuationCapitalizationEvalDataConfig - ds_item = ds_section.get('ds_item') - ds_item = legacy_dataset_section.get('data_dir') if ds_item is None else ds_item - if ds_item is None: - raise ValueError( - f"Data directory was not found in legacy config.\nspecific dataset configuration:\n" - f"{OmegaConf.to_yaml(ds_section)}\nmodel.dataset:\n{OmegaConf.to_yaml(legacy_dataset_section)}" - ) - new_config = OmegaConf.structured( - cls( - use_tarred_dataset=False, - text_file=ds_section.text_file, - labels_file=ds_section.labels_file, - ds_item=ds_item, - max_seq_length=legacy_dataset_section.get( - 'max_seq_length', PunctuationCapitalizationDataConfigBase.max_seq_length - ), - ) - ) - return new_config - - -def _check_number_of_labels( - words: List[str], - query: str, - qi: int, - split_i: int, - punctuation_labels: List[str], - capitalization_labels: List[str], -) -> None: - if len(words) != len(punctuation_labels): - raise ValueError( - f"Number of punctuation labels for a query number {qi} in a split number {split_i} is not equal to " - f"number of words. Number of words: {len(words)}, number of punctuation labels: " - f"{len(punctuation_labels)}. First 100 characters of the query: '{query[:100]}', punctuation labels: " - f"'{punctuation_labels}'" - ) - if len(words) != len(capitalization_labels): - raise ValueError( - f"Number of capitalization labels for a query number {qi} in a split number {split_i} is not equal to " - f"number of words. Number of words: {len(words)}, number of capitalization labels: " - f"{len(capitalization_labels)}. First 100 characters of the query: '{query[:100]}', " - f"capitalization labels: '{capitalization_labels}'" - ) - - -def _show_prog(queues: Tuple[mp.Queue, ...], totals: List[int], descriptions: List[str], units: List[str]) -> None: - """ - Show several ``tqdm`` progress bars. - Args: - queues: a list of queues by which progress is delivered into this function. Each queue is responsible for one - progress bar. ``show_prog`` function extracts integers from ``queues`` elements and adds them to progress - bars. If value extracted from a queue equals ``-1``, then corresponding progress bar is closed. When all - progress bars are closed, this function returns. - totals: list of values 100% of progress bars. See more in a description of ``total`` parameter of - ``tqdm.tqdm`` function - descriptions: list of descriptions of progress bars. See more in a description of ``desc`` parameter of - ``tqdm.tqdm`` function - units: list of progress bar units. See more in a description of ``unit`` parameter of ``tqdm.tqdm`` function - """ - if not all([len(queues) == len(v) for v in [totals, descriptions, units]]): - raise ValueError( - f"All of parameters `queues`, `total_num_lines`, `descriptions`, `units` have to have equal lengths. 
" - f"len(queues)={len(queues)}, len(total_num_lines)={len(totals)}, " - f"len(descriptions)={len(descriptions)}, len(units)={len(units)}." - ) - prog = [ - tqdm(total=tt, desc=dd, unit=uu, unit_scale=True, position=i) - for i, (tt, dd, uu) in enumerate(zip(totals, descriptions, units)) - ] - finished = [False] * len(queues) - while True: - for i, queue in enumerate(queues): - stop = False - to_add = 0 - try: - v = queue.get(block=False) - while v != -1: - to_add += v - v = queue.get(block=False) - stop = True - except Empty: - if to_add == 0 and not stop: - continue - prog[i].n += to_add - prog[i].update(0) - if prog[i].n >= totals[i]: - finished[i] = True - prog[i].close() - if stop: - if prog[i].n < totals[i]: - logging.warning( - f"Progress with description '{descriptions[i]}' terminated before progress bar " - f"reached 100%. prog.n={prog[i].n}, total_num_lines={totals[i]}" - ) - finished[i] = True - prog[i].close() - if all(finished): - break - sleep(0.1) - - -class Progress: - """ - Manages several ``tqdm`` progress bars for multiprocess tasks. This class can be used as context manager. - - The class starts separate process which creates and updates progress bars. Information to progress process is - passed via multiprocessing queues. There is a separate queue for every progress bar. - - You can use it as context manager: - - .. code-block:: python - with Progress([10, 20], ["progress bar 1", "progress bar 2"], ["parrot", "frog"]) as progress_queues: - num_processes = 10 - with multiprocessing.Pool(num_processes) as pool: - data = list(zip(my_data, [progress_queues[0]] * num_processes, [progress_queues[1]] * num_processes)) - pool.starmap(worker_func, data) - - Or without context manager: - - .. code-block:: python - progress = Progress([10, 20], ["progress bar 1", "progress bar 2"], ["parrot", "frog"]) - progress_queues = progress.get_queue() - num_processes = 10 - with multiprocessing.Pool(num_processes) as pool: - data = list(zip(my_data, [progress_queues[0]] * num_processes, [progress_queues[1]] * num_processes)) - pool.starmap(worker_func, data) - progress.finish() - - In a worker function you will have to put number of processed items into the progress queues. For example: - - .. code-block:: python - def worker_func(my_datum, parrot_progress_queue, frog_progress_queue): - ... - for i in range(10): - parrot_progress_queue.put(1) - frog_progress_queue.put(2) - - Progress bars and progress process are closed when ``finish`` or ``__exit__`` methods are called. - """ - - def __init__(self, total: Union[int, List[int]], desc: Union[str, List[str]], unit: Union[str, List[str]]) -> None: - """ - Starts progress process and creates queues for passing information to the progress process. Number of progress - bars is equal to the max length of lists ``total``, ``desc``, ``unit``. If none of these parameters is a list, - then 1 progress bar is created. - - Args: - total: a list of ``int`` which length is equal to the number of progress bars OR an ``int`` OR a list of - one ``int``. Number which comprises 100% of progress bar. When sum of values passed through the - corresponding queue equals ``total`` corresponding progress bar reaches 100%. If ``total`` is an - ``int`` or a list of one element, then all progress bars have equal ``total`` parameter. - desc: a list of ``str`` which length is equal to the number of progress bars OR a ``str`` OR a list of one - ``str``. Description of a progress bar which is showed as a prefix. 
See more in description of - parameter ``desc`` of function ``tqdm.tqdm``. - unit: a list of ``str`` which length is equal to the number of progress bars OR a ``str`` OR a list of one - ``str``. A unit of a progress bar. See more in description of parameter ``unit`` of function - ``tqdm.tqdm``. - """ - if not isinstance(total, list): - total = [total] - if not isinstance(desc, list): - desc = [desc] - if not isinstance(unit, list): - unit = [unit] - num_processes = max([len(total), len(desc), len(unit)]) - for param in [total, desc, unit]: - if len(param) not in [num_processes, 1]: - raise ValueError( - f"If parameter of `Progress.__init__` method is a list, then it has to be the same length as other " - f"parameters which are lists" - ) - if len(param) == 1: - param *= num_processes - manager = mp.Manager() - self.progress_queues = tuple(manager.Queue() for _ in range(num_processes)) - self.progress_process = mp.Process(target=_show_prog, args=(self.progress_queues, total, desc, unit)) - self.progress_process.start() - - def __enter__(self) -> Tuple[mp.Queue, ...]: - return self.get_queues() - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.finish() - - def get_queues(self) -> Tuple[mp.Queue, ...]: - return self.progress_queues - - def finish(self) -> None: - for q in self.progress_queues: - q.put(-1) - self.progress_process.join() - - -class TokenizeCreateMasksClipWorker: - """A worker for tokenization, encoding labels, creating masks for first token in a word, sequence clipping""" - - def __init__( - self, - max_seq_length: int, - tokenizer: TokenizerSpec, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - pad_label: str, - verbose: bool, - progress_queue: mp.Queue, - ) -> None: - """ - Args: - max_seq_length: max number of tokens in an input sequence including [CLS] and [SEP] tokens. If number of - tokens in a sequence exceeds ``max_seq_length``, then excess tokens in the end of the sequence - are removed - tokenizer: a tokenizer instance which has properties ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id`` - punct_label_ids: dict to map punctuation labels to label ids. Starts with pad_label->0. - capit_label_ids: dict to map capitalization labels to label ids. Starts with pad_label->0. - pad_label: pad value use for labels. By default, it's the neutral label for punctuation and capitalization. - Its id in ``punct_label_ids`` and ``capit_label_ids`` has to be ``0`` - verbose: whether to report when the worker finishes its job - progress_queue: a multiprocessing queue used for reporting progress. 
Useful for creating tarred dataset - """ - self.max_seq_length = max_seq_length - self.tokenizer = tokenizer - self.punct_label_ids = punct_label_ids - self.capit_label_ids = capit_label_ids - self.pad_label = pad_label - self.verbose = verbose - self.progress_queue = progress_queue - - def _maybe_clip(self, values: List[int], append_value: int) -> List[int]: - if len(values) > self.max_seq_length: - return values[: self.max_seq_length - 1] + [append_value] - return values - - def __call__( - self, - queries: List[str], - punct_label_lines: Optional[Union[List[str], Tuple[str, ...]]], - capit_label_lines: Optional[Union[List[str], Tuple[str, ...]]], - split_i: int, - audio_queries: Optional[List[str]] = None, - sample_rate: Optional[int] = None, - preload_audios: Optional[bool] = True, - ) -> Tuple[ - List[ndarray], - List[ndarray], - List[ndarray], - List[ndarray], - Union[List[Any], List[None]], - Union[List[Any], List[None]], - Union[List[Any], List[None]], - ]: - """ - Tokenize, clip, encode labels, and create masks of first tokens in words. - - Args: - queries: text sequences - punct_label_lines: a list or a tuple of labels for every word in a sequence (str) - capit_label_lines: a list of a tuple labels for every word in a sequence (str) - split_i: number of a split which is processed. Used for logging - audio_queries: a list of audio filepaths - sample_rate: target sample rate of audios - preload_audios: whether to preload audios or not - - Returns: - input_ids: a list of 1D int32 arrays. Each array contains token ids of the corresponding query - subtokens_mask: a list of 1D boolean arrays. An array element is ``True`` if corresponding token is the - first token in a word - punct_labels: a list of 1D int32 arrays. Encoded punctuation labels for every token in a query. Tokens in - one word have identical labels - capit_labels: a list of 1D int32 arrays. Encoded capitalization labels for every token in a query. 
Tokens - in one word have identical labels - """ - all_input_ids, all_subtokens_mask, punct_all_labels, capit_all_labels = [], [], [], [] - dummy = [None] * len(queries) # Needed to avoid code duplication with different values of `self.use_audio` - all_audio_waveforms = [] if preload_audios else dummy - audio_lengths = [] if preload_audios else dummy - audio_filepaths = [] if not preload_audios else dummy - progress_made = 0 - queries = zip(queries, audio_queries) if audio_queries else zip(queries, dummy) - for i, (query, audio_query) in enumerate(queries): - words = query.split() - input_ids, subtokens_mask = [self.tokenizer.cls_id], [0] - _check_number_of_labels(words, query, i, split_i, punct_label_lines[i], capit_label_lines[i]) - pad_id = self.punct_label_ids[self.pad_label] - punct_labels = [pad_id] - punct_query_labels = [self.punct_label_ids[lab] for lab in punct_label_lines[i]] - capit_labels = [pad_id] - capit_query_labels = [self.capit_label_ids[lab] for lab in capit_label_lines[i]] - for j, word in enumerate(words): - word_ids = self.tokenizer.text_to_ids(word) - if not word_ids and len(word): - word_ids = [self.tokenizer.unk_id] - input_ids.extend(word_ids) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_ids) - 1)) - - punct_labels.extend([punct_query_labels[j]] * len(word_ids)) - capit_labels.extend([capit_query_labels[j]] * len(word_ids)) - - # add eos token - input_ids.append(self.tokenizer.sep_id) - subtokens_mask.append(0) - - all_input_ids.append(np.array(self._maybe_clip(input_ids, self.tokenizer.sep_id), dtype=np.int32)) - all_subtokens_mask.append(np.array(self._maybe_clip(subtokens_mask, 0), dtype=bool)) - - punct_labels.append(pad_id) - punct_all_labels.append(np.array(self._maybe_clip(punct_labels, pad_id), dtype=np.int32)) - capit_labels.append(pad_id) - capit_all_labels.append(np.array(self._maybe_clip(capit_labels, pad_id), dtype=np.int32)) - if preload_audios and audio_query: - if ASR_AVAILABLE: - segment = AudioSegment.from_file(audio_query.strip(), target_sr=sample_rate) - all_audio_waveforms.append(segment.samples) - audio_lengths.append(segment.num_samples) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - - elif audio_query: - audio_filepaths.append(audio_query.strip()) - - progress_made += 1 - if progress_made >= TOKENIZATION_PROGRESS_REPORT_PERIOD: - self.progress_queue.put(progress_made) - progress_made = 0 - - self.progress_queue.put(progress_made) - if self.verbose: - logging.info(f"Finished processing data split number {split_i}") - - return ( - all_input_ids, - all_subtokens_mask, - punct_all_labels, - capit_all_labels, - all_audio_waveforms, - audio_lengths, - audio_filepaths, - ) - - -def _get_features( - queries: Union[List[str], Tuple[str, ...]], - punct_label_lines: Union[List[str], Tuple[str, ...]], - capit_label_lines: Union[List[str], Tuple[str, ...]], - max_seq_length: int, - tokenizer: TokenizerSpec, - punct_label_ids: Dict[str, int] = None, - capit_label_ids: Dict[str, int] = None, - pad_label: str = 'O', - verbose: bool = True, - n_jobs: Optional[int] = 0, - progress_queue: Optional[mp.Queue] = None, - audio_queries: Optional[List[str]] = None, - sample_rate: Optional[int] = None, - preload_audios: Optional[bool] = True, -) -> Tuple[List[Any], List[Any], List[Any], List[Any], List[Any], List[Any], List[Any]]: - """ - Tokenizes data, encodes labels, creates masks of first tokens in words, clips 
sequences by number of tokens. - - Args: - queries: text sequences - max_seq_length: max number of tokens in an input sequence including [CLS] and [SEP] tokens. If number of tokens - in a sequence exceeds ``max_seq_length``, then excess tokens in the end of the sequence are removed - tokenizer: a tokenizer instance which has properties ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id`` - punct_label_ids: dict to map punctuation labels to label ids. Starts with pad_label->0. - capit_label_ids: dict to map capitalization labels to label ids. Starts with pad_label->0. - pad_label: pad value use for labels. By default, it's the neutral label for punctuation and capitalization. - Its id in ``punct_label_ids`` and ``capit_label_ids`` has to be ``0`` - punct_label_lines: a list of a tuple of labels for every word in a sequence (str) - capit_label_lines: a list or a tuple of labels for every word in a sequence (str) - verbose: whether to show examples of tokenized data and various progress information - n_jobs: a number of workers used for preparing features. If ``n_jobs <= 0``, then do not use multiprocessing - and run features creation in this process. If not set, number of workers will be equal to the number of - CPUs. - - !!WARNING!! - There can be deadlocking problems with some tokenizers (e.g. SentencePiece, HuggingFace AlBERT) - if ``n_jobs > 0``. - - progress_queue: a multiprocessing queue used for reporting progress. Useful for creating tarred dataset - audio_queries: a list of audio filepaths - sample_rate: target sample rate of audios - preload_audios: whether to preload audios or not - - Returns: - input_ids: a list of 1D int32 arrays. Each array contains token ids of corresponding query - subtokens_mask: a list of 1D boolean arrays. An array element is ``True`` if corresponding token is the - first token in a word - punct_labels: a list of 1D int32 arrays. Encoded punctuation labels for every token in a query. Tokens in one - word have identical labels. - capit_labels: a list of 1D int32 arrays. Encoded capitalization labels for every token in a query. 
Tokens in - one word have identical labels - """ - if verbose: - logging.info("Start initial tokenization.") - create_progress_process = progress_queue is None - if n_jobs is None: - n_jobs = min(mp.cpu_count(), len(queries)) - - if verbose: - logging.info(f"Running tokenization with {n_jobs} jobs.") - - # Number of queries in split - split_size = min(len(queries) // max(n_jobs, 1), MAX_NUM_QUERIES_IN_SPLIT) - n_split = len(queries) // split_size - split_queries = [queries[split_size * i : split_size * (i + 1)] for i in range(n_split - 1)] + [ - queries[split_size * (n_split - 1) :] - ] - split_punct_labels_lines = [ - punct_label_lines[split_size * i : split_size * (i + 1)] for i in range(n_split - 1) - ] + [punct_label_lines[split_size * (n_split - 1) :]] - split_capit_labels_lines = [ - capit_label_lines[split_size * i : split_size * (i + 1)] for i in range(n_split - 1) - ] + [capit_label_lines[split_size * (n_split - 1) :]] - - args = list(zip(split_queries, split_punct_labels_lines, split_capit_labels_lines, range(n_split))) - if audio_queries: - split_audio_queries = [audio_queries[split_size * i : split_size * (i + 1)] for i in range(n_split - 1)] + [ - audio_queries[split_size * (n_split - 1) :] - ] - - args = list( - zip( - split_queries, - split_punct_labels_lines, - split_capit_labels_lines, - range(n_split), - split_audio_queries, - [sample_rate for _ in range(n_split)], - [preload_audios for _ in range(n_split)], - ) - ) - if create_progress_process: - progress = Progress(len(queries), "Tokenization", "query") - progress_queue = progress.get_queues()[0] - if n_jobs > 0: - with mp.Pool(n_jobs) as pool: - result = pool.starmap( - TokenizeCreateMasksClipWorker( - max_seq_length, tokenizer, punct_label_ids, capit_label_ids, pad_label, verbose, progress_queue, - ), - args, - ) - else: - result = [] - for x in args: - result.append( - TokenizeCreateMasksClipWorker( - max_seq_length, tokenizer, punct_label_ids, capit_label_ids, pad_label, verbose, progress_queue, - )(*x) - ) - if create_progress_process: - progress.finish() - - input_ids, subtokens_mask, punct_labels, capit_labels, waveforms, audio_lengths, audio_filepaths = tuple( - list(itertools.chain(*e)) for e in zip(*result) - ) - if verbose: - logging.info("Finished initial tokenization.") - get_stats([len(inp) for inp in input_ids]) - logging.info(f"Finished clipping and padding.") - for i in range(min(len(input_ids), 5)): - logging.info("*** Example ***") - logging.info("i: %s" % i) - logging.info("subtokens: %s" % " ".join(list(map(str, input_ids[i])))) - logging.info("subtokens_mask: %s" % " ".join(list(map(str, subtokens_mask[i])))) - logging.info("punct_labels: %s" % " ".join(list(map(str, punct_labels[i])))) - logging.info("capit_labels: %s" % " ".join(list(map(str, capit_labels[i])))) - - return ( - input_ids, - subtokens_mask, - waveforms, - audio_lengths, - audio_filepaths, - punct_labels, - capit_labels, - ) - - -def create_masks_and_segment_ids( - input_ids: np.ndarray, - subtokens_mask: np.ndarray, - pad_id: int, - cls_id: int, - sep_id: int, - ignore_start_end: bool, - ignore_extra_tokens: bool, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Creates segment ids array, input mask, loss mask. - - Segment ids array is BERT token type ids in HuggingFace terminology. It is a zeros array for punctuation - and capitalization task. - - Input mask element is ``True`` if an element of ``input_ids`` is not padding and ``False`` otherwise. - - Loss mask element is ``True`` for the first token in a word. 
If ``ignore_start_end=False``, then loss mask
- element is ``True`` for [CLS] and [SEP] tokens. If ``ignore_extra_tokens=False``, then loss mask element is ``True``
- for all word tokens. In all other cases loss mask elements are ``False``.
-
- Args:
- input_ids: an integer array of shape ``[Batch, Time]`` containing ids of source tokens
- subtokens_mask: a boolean array of shape ``[Batch, Time]`` whose elements are ``True`` if they correspond to
- the first token of some word
- pad_id: an id of padding token
- cls_id: an id of [CLS] token
- sep_id: an id of [SEP] token
- ignore_start_end: whether to compute loss for [CLS] and [SEP] tokens
- ignore_extra_tokens: whether to compute loss for tokens which are not first tokens in words
-
- Returns:
- segment_ids: int8 array of shape [Batch, Time]
- input_mask: boolean array of shape [Batch, Time]
- loss_mask: boolean array of shape [Batch, Time]
- """
- segment_ids = np.zeros_like(input_ids, dtype=np.int8)
- input_mask = np.not_equal(input_ids, pad_id)
- # a token is special if it is either [CLS] or [SEP]
- special_mask = np.equal(input_ids, cls_id) | np.equal(input_ids, sep_id)
- if ignore_start_end:
- if ignore_extra_tokens:
- loss_mask = subtokens_mask
- else:
- loss_mask = input_mask & ~special_mask
- else:
- if ignore_extra_tokens:
- loss_mask = subtokens_mask | special_mask
- else:
- loss_mask = input_mask
- return segment_ids, input_mask, loss_mask
-
-
-def create_label_ids(unique_labels: Set[str], pad_label: str) -> Dict[str, int]:
- """
- Returns label ids dictionary. ``pad_label`` always has id ``0``. Other labels are sorted in alphabetical order.
- Args:
- unique_labels: a set of labels from which label ids dictionary is created. May or may not contain ``pad_label``
- pad_label: label used for padding. It is also a neutral label
-
- Returns:
- label ids dictionary
- """
- label_ids = {pad_label: 0}
- if pad_label in unique_labels:
- unique_labels.remove(pad_label)
- for label in sorted(unique_labels):
- label_ids[label] = len(label_ids)
- return label_ids
-
-
-def load_label_ids(file_path: Union[str, os.PathLike]) -> Dict[str, int]:
- ids = {}
- with open(file_path, encoding='utf_8') as f:
- for i, line in enumerate(f):
- ids[line.strip()] = i
- return ids
-
-
-def save_label_ids(label_ids: Dict[str, int], file_path: Path) -> None:
- """
- Saves label ids map to a file. In each line of a file one label is saved. Labels are saved in the order of
- increasing ids.
-
- Args:
- label_ids: label id dictionary. Pad label has to have id ``0``
- file_path: path to a file where labels will be saved
- """
- file_path.parent.mkdir(parents=True, exist_ok=True)
- with file_path.open('w', encoding='utf_8', newline='\n') as out:
- labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1]))
- out.write('\n'.join(labels))
-
-
-def raise_not_equal_labels_error(
- first_labels: Dict[str, int], second_labels: Dict[str, int], first_labels_desc: str, second_labels_desc: str
-) -> None:
- """
- A helper function for raising a comprehensible error if labels from 2 sources are different.
- Such sources may include: - - labels stored in .nemo checkpoint - - labels stored in tarred dataset - - labels passed in config parameters ``model.common_dataset_parameters.{punct_label_ids,capit_label_ids}`` - - labels from files passed in config parameters ``model.class_labels.{punct_labels_file,capit_labels_file}`` - - labels in attributes ``PunctuationCapitalizationModel.{punct_label_ids,capit_label_ids}`` - - any other source - This function helps to detect configuration early and give error messages that are easy to interpret. - Call this function if ``first_labels != second_labels``. - - Args: - first_labels: first dictionary with labels - second_labels: second dictionary with labels - first_labels_desc: a description of first labels - second_labels_desc: a description of second labels - """ - missing_in_first = {k: second_labels[k] for k in set(second_labels) - set(first_labels)} - missing_in_second = {k: first_labels[k] for k in set(first_labels) - set(second_labels)} - not_equal = { - k: {'FIRST LABELS': first_labels[k], 'SECOND LABELS': second_labels[k]} - for k in set(first_labels) & set(second_labels) - if first_labels[k] != second_labels[k] - } - msg = f"{first_labels_desc} (FIRST LABELS) are not equal to {second_labels_desc} (SECOND LABELS)." - if len(missing_in_first) > 0: - msg += f" Number of SECOND LABELS missing in the FIRST LABELS: {len(missing_in_first)}." - if len(missing_in_second) > 0: - msg += f" Number of FIRST LABELS missing in the SECOND LABELS: {len(missing_in_second)}." - if len(not_equal) > 0: - msg += f" Number of labels which are not equal: {len(not_equal)}." - if len(missing_in_first) > 0: - msg += ( - f" Several examples of missing SECONDS LABELS in the FIRST LABELS: " - f"{dict(list(missing_in_first.items())[:3])}." - ) - if len(missing_in_second) > 0: - msg += ( - f" Several examples of missing FIRST LABELS in the SECOND LABELS: " - f"{dict(list(missing_in_second.items())[:3])}." - ) - if len(not_equal) > 0: - msg += f" Several examples of labels which are not equal: {dict(list(not_equal.items())[:3])}" - raise ValueError(msg) - - -def pad(vectors: List[np.ndarray], length: int, value: Union[int, float, bool]) -> np.ndarray: - """ - Pad vectors to length ``length`` and then stack. - Args: - vectors: a list of 1D arrays. Arrays to pad and stack - length: a length of padded sequence. Has to be greater or equal to the maximum length of an element of - ``vectors``. - value: a value used for padding - - Returns: - an array of padded vectors - """ - result = [] - for v in vectors: - result.append(np.concatenate([v, np.full([length - v.shape[0]], value, dtype=v.dtype)])) - return np.stack(result) - - -class BertPunctuationCapitalizationDataset(Dataset): - """ - A dataset to use during training for punctuation and capitalization tasks. - For inference, you will need - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset.BertPunctuationCapitalizationInferDataset`. - For huge datasets which cannot be loaded into memory simultaneously use - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset`. - - Args: - text_file (:obj:`Union[str, os.PathLike]`): a path to a file with sequences, each line should contain a text - without punctuation and capitalization - labels_file (:obj:`Union[str, os.PathLike]`): a path to a file with labels, each line corresponds to word - labels for a sentence in the ``text_file``. 
Labels have to follow the format described in this section of
- documentation :ref:`NeMo Data Format`.
- max_seq_length (:obj:`int`): max number of tokens in a source sequence. ``max_seq_length`` includes [CLS]
- and [SEP] tokens. Sequences which are too long will be clipped by removal of tokens from the end of the
- sequence.
- tokenizer (:obj:`TokenizerSpec`): a tokenizer instance which has properties ``unk_id``, ``sep_id``, ``bos_id``,
- ``eos_id``.
- num_samples (:obj:`int`, `optional`, defaults to :obj:`-1`): a number of samples you want to use for the
- dataset. If ``-1``, use the whole dataset. Useful for testing.
- tokens_in_batch (:obj:`int`, `optional`, defaults to :obj:`5000`): number of tokens in a batch including
- paddings and special tokens ([CLS], [SEP], [UNK]). The :meth:`__getitem__` method of this class returns not
- samples but ready batches. Number of samples in a batch is adjusted for input sequence lengths. If input
- sequences are short, then a batch will contain more samples. Before packing into batches, samples are
- sorted by number of tokens they contain. Sorting allows reducing the number of pad tokens in a batch
- significantly. Regular PyTorch data loader shuffling will only permute batches without changing their content.
- Proper shuffling is achieved by calling the method :meth:`repack_batches_with_shuffle` every epoch. If
- parameter ``number_of_batches_is_multiple_of`` is greater than 1, some batches may be split into smaller
- pieces.
- pad_label (:obj:`str`, `optional`, defaults to :obj:`'O'`): pad value to use for labels. It's also the neutral
- label both for punctuation and capitalization.
- punct_label_ids (:obj:`Dict[str, int]`, `optional`): dict to map punctuation labels to label ids. For dev set,
- use label ids generated during training to support cases when not all labels are present in the dev set.
- For training, it is recommended to set ``punct_label_ids`` to ``None`` or load from cache.
- capit_label_ids (:obj:`Dict[str, int]`, `optional`): same as ``punct_label_ids`` but for capitalization labels.
- ignore_extra_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to compute loss on
- tokens which are not first tokens in a word. For example, assume that word ``'tokenization'`` is tokenized
- into ``['token', 'ization']``. If ``ignore_extra_tokens=True``, loss mask for the word is
- ``[True, False]``, and if ``ignore_extra_tokens=False``, then loss mask is ``[True, True]``.
- ignore_start_end (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to ignore [CLS] and [SEP] tokens
- in the loss_mask.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to use pickled features already present
- in ``cache_dir`` or not. If pickled features file does not exist or ``use_cache=False``, then features are
- pickled in ``cache_dir``. Pickled features include input ids, subtokens mask (mask of first tokens in
- words), encoded punctuation and capitalization labels, label ids. Feature creation consumes considerable
- time, so ``use_cache=True`` significantly speeds up training startup. Pickled features are also
- used for sharing features between processes if data parallel training is used.
- cache_dir (:obj:`Union[str, os.PathLike]`, `optional`): a path to a directory where cache (pickled features)
- is stored. By default, ``text_file`` parent directory is used. This parameter is useful if dataset
- directory is read-only, and you wish to pickle features.
In such a case, specify a path to a directory which
- allows writing in the ``cache_dir`` parameter.
- get_label_frequencies (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to print and save label
- frequencies. Frequencies are shown if the ``verbose`` parameter is ``True``. If
- ``get_label_frequencies=True``, then frequencies are saved into the ``label_info_save_dir`` directory.
- label_info_save_dir (:obj:`Union[str, os.PathLike]`, `optional`): a path to a directory where label frequencies
- are saved. By default, the ``text_file`` parent directory is used. When the method
- :meth:`save_labels_and_get_file_paths` is called, label ids are saved into the ``label_info_save_dir``
- directory. This parameter is useful if the directory containing ``text_file`` is read-only.
- punct_label_vocab_file (:obj:`Union[str, os.PathLike]`, `optional`): a path to a .csv file containing
- punctuation label vocabulary. Each line in such a vocabulary file contains exactly one label. The first
- line has to contain `pad_label`, otherwise an error will be raised.
- capit_label_vocab_file (:obj:`Union[str, os.PathLike]`, `optional`): same as ``punct_label_vocab_file`` for
- capitalization labels.
- add_masks_and_segment_ids_to_batch (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to add
- ``'loss_mask'``, ``'input_mask'``, ``'segment_ids'`` items to a batch. Useful for creation of tarred
- dataset and can NOT be used during model training and inference.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to show data examples, label stats and
- other useful information.
- n_jobs (:obj:`int`, `optional`, defaults to :obj:`0`): number of workers used for tokenization, encoding
- labels, creating "first token in word" mask, and clipping. If ``n_jobs <= 0``, data preparation is performed
- without multiprocessing. By default, ``n_jobs`` is ``0``.
-
- .. warning::
- There can be deadlocking problems with some tokenizers (e.g. SentencePiece, HuggingFace AlBERT)
- if ``n_jobs > 0``.
-
- number_of_batches_is_multiple_of (:obj:`int`, `optional`, defaults to :obj:`1`): number of batches in the
- dataset is made divisible by ``number_of_batches_is_multiple_of``. If ``number_of_batches_is_multiple_of``
- is greater than 1, then several batches are split in parts until the number of batches
- is divisible by ``number_of_batches_is_multiple_of``. If there are not enough queries in the dataset to
- create enough batches, then a warning is printed. This parameter is useful for dev and validation datasets
- if multiple GPUs are used. The problem is that if the number of batches is not evenly divisible by the number of
- GPUs, then some queries may be processed several times and metrics will be distorted.
- batch_shuffling_random_seed (:obj:`int`, defaults to :obj:`42`): a random seed used for batch repacking and
- shuffling.
- tokenization_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting tokenization
- progress. Useful for creation of tarred dataset
- batch_mark_up_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting progress in
- deciding which samples go into which batches. Useful for creation of tarred dataset
- batch_building_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting progress in
- batch creation (stacking and padding). Useful for creation of tarred dataset
- use_audio (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to ``True``, the dataset will return audio as well as text.
- audio_file (:obj:`Union[str, os.PathLike]`, `optional`): a path to file with audio paths. - sample_rate (:obj:`int`, `optional`, defaults to :obj:`None`): sample rate of audios. Can be used for up sampling or down sampling of audio. - use_bucketing (:obj:`bool`, `optional`, defaults to :obj: `True`): If set to False dataset will return ``batch_size`` batches instead of ``number_of_tokens`` tokens. - preload_audios (:obj:`bool`, `optional`, defaults to :obj: `True`): If set to True batches will include waveforms, if set to False will store audio_filepaths instead and load audios during ``collate_fn`` call - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. """ - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - text_file: Union[str, os.PathLike], - labels_file: Union[str, os.PathLike], - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - tokens_in_batch: int = 5000, - pad_label: str = 'O', - punct_label_ids: Optional[Union[Dict[str, int], DictConfig]] = None, - capit_label_ids: Optional[Union[Dict[str, int], DictConfig]] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = True, - use_cache: bool = True, - cache_dir: Optional[Union[str, os.PathLike]] = None, - get_label_frequencies: bool = False, - label_info_save_dir: Optional[Union[str, os.PathLike]] = None, - punct_label_vocab_file: Optional[Union[str, os.PathLike]] = None, - capit_label_vocab_file: Optional[Union[str, os.PathLike]] = None, - add_masks_and_segment_ids_to_batch: bool = True, - verbose: bool = True, - n_jobs: Optional[int] = 0, - number_of_batches_is_multiple_of: int = 1, - batch_shuffling_random_seed: int = 42, - tokenization_progress_queue: Optional[mp.Queue] = None, - batch_mark_up_progress_queue: Optional[mp.Queue] = None, - batch_building_progress_queue: Optional[mp.Queue] = None, - use_audio: Optional[bool] = False, - audio_file: Optional[Union[str, os.PathLike]] = None, - sample_rate: Optional[int] = None, - use_bucketing: Optional[bool] = True, - preload_audios: Optional[bool] = True, - ) -> None: - """ Initializes BertPunctuationCapitalizationDataset. 
""" - if isinstance(punct_label_ids, DictConfig): - punct_label_ids = OmegaConf.to_container(punct_label_ids) - if isinstance(capit_label_ids, DictConfig): - capit_label_ids = OmegaConf.to_container(capit_label_ids) - - self._check_constructor_parameters( - text_file, - labels_file, - punct_label_ids, - capit_label_ids, - punct_label_vocab_file, - capit_label_vocab_file, - num_samples, - use_cache, - number_of_batches_is_multiple_of, - use_audio, - audio_file, - sample_rate, - ) - - if punct_label_vocab_file is not None: - punct_label_vocab_file = Path(punct_label_vocab_file).expanduser() - punct_label_ids = load_label_ids(punct_label_vocab_file) - if capit_label_vocab_file is not None: - capit_label_vocab_file = Path(capit_label_vocab_file).expanduser() - capit_label_ids = load_label_ids(capit_label_vocab_file) - self.text_file, self.labels_file = Path(text_file).expanduser(), Path(labels_file).expanduser() - if label_info_save_dir is None: - self.label_info_save_dir = self.text_file.parent - else: - self.label_info_save_dir = Path(label_info_save_dir).expanduser() - - self.tokens_in_batch = tokens_in_batch - self.tokenizer = tokenizer - self.pad_label = pad_label - self.ignore_extra_tokens = ignore_extra_tokens - self.ignore_start_end = ignore_start_end - self.add_masks_and_segment_ids_to_batch = add_masks_and_segment_ids_to_batch - self.verbose = verbose - self.batch_mark_up_progress_queue = batch_mark_up_progress_queue - self.batch_building_progress_queue = batch_building_progress_queue - self.use_audio = use_audio - self.audio_file = audio_file - self.sample_rate = sample_rate - self.use_bucketing = use_bucketing - self.preload_audios = preload_audios - - master_device = is_global_rank_zero() - self.features_pkl = self._get_path_to_pkl_features( - self.text_file, self.labels_file, cache_dir, max_seq_length, num_samples - ) - features = None - if master_device and not (self.features_pkl.is_file() and use_cache): - if verbose: - logging.info( - f'Processing {self.text_file}' + f' {self.audio_file if self.audio_file else ""} '.rstrip() - ) - - ( - text_lines, - punct_label_lines, - capit_label_lines, - punct_unique_labels, - capit_unique_labels, - audio_lines, - ) = self._read_dataset(self.text_file, self.labels_file, num_samples, self.audio_file) - - if punct_label_ids: - self._check_label_ids_vs_unique_labels( - punct_label_ids, punct_unique_labels, 'punct', 'punctuation', self.labels_file - ) - else: - punct_label_ids = create_label_ids(punct_unique_labels, self.pad_label) - if capit_label_ids: - self._check_label_ids_vs_unique_labels( - capit_label_ids, capit_unique_labels, 'capit', 'capitalization', self.labels_file - ) - else: - capit_label_ids = create_label_ids(capit_unique_labels, self.pad_label) - features = _get_features( - text_lines, - punct_label_lines, - capit_label_lines, - max_seq_length, - self.tokenizer, - pad_label=self.pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - verbose=self.verbose, - progress_queue=tokenization_progress_queue, - n_jobs=n_jobs, - audio_queries=audio_lines if self.use_audio else None, - sample_rate=self.sample_rate, - preload_audios=self.preload_audios, - ) - self.features_pkl.parent.mkdir(parents=True, exist_ok=True) - - # save features to a temp file first to make sure that non-master processes don't start reading the file - # until the master process is done with writing - ofd, tmp_features_pkl = tempfile.mkstemp( - suffix='.pkl', prefix=os.path.basename(self.features_pkl), 
dir=os.path.dirname(self.features_pkl) - ) - with os.fdopen(ofd, 'wb') as temp_f: - pickle.dump(tuple(list(features) + [punct_label_ids, capit_label_ids]), temp_f) - - os.rename(tmp_features_pkl, self.features_pkl) - - if self.verbose: - logging.info(f'Features saved to {self.features_pkl}') - - # wait until the master process writes to the processed data files - if not master_device: - while features is None and not os.path.exists(self.features_pkl): - sleep(10) - - if features is None: - features = pickle.load(self.features_pkl.open('rb')) - li = features[-2:] - self._check_label_ids_loaded_from_pkl( - punct_label_ids, capit_label_ids, *li, punct_label_vocab_file, capit_label_vocab_file - ) - punct_label_ids, capit_label_ids = li[-2], li[-1] - if tokenization_progress_queue is not None: - tokenization_progress_queue.put(len(features[0])) - if self.verbose: - logging.info(f'Features restored from {self.features_pkl}') - features = features[:-2] - - ( - self.input_ids, - self.subtokens_mask, - self.waveforms, - self.waveforms_length, - self.audio_filepaths, - self.punct_labels, - self.capit_labels, - ) = features - self.punct_label_ids, self.capit_label_ids = punct_label_ids, capit_label_ids - self.number_of_batches_is_multiple_of = number_of_batches_is_multiple_of - self.batch_shuffling_random_state = np.random.RandomState(batch_shuffling_random_seed) - if get_label_frequencies: - self.punct_label_frequencies = self._calculate_and_save_label_frequencies(self.punct_labels, 'punct') - self.capit_label_frequencies = self._calculate_and_save_label_frequencies(self.capit_labels, 'capit') - if self.use_bucketing: - self.batches = self._pack_into_batches( - input_ids=self.input_ids, - subtokens_mask=self.subtokens_mask, - punct_labels=self.punct_labels, - capit_labels=self.capit_labels, - waveforms=self.waveforms, - audio_lengths=self.waveforms_length, - audio_filepaths=self.audio_filepaths, - ) - else: - self.batches = self._form_batches( - input_ids=self.input_ids, - subtokens_mask=self.subtokens_mask, - punct_labels=self.punct_labels, - capit_labels=self.capit_labels, - waveforms=self.waveforms, - audio_lengths=self.waveforms_length, - audio_filepaths=self.audio_filepaths, - ) - - def _get_path_to_pkl_features( - self, - text_file: Path, - labels_file: Path, - cache_dir: Optional[Union[str, os.PathLike]], - max_seq_length: int, - num_samples: int, - ) -> Path: - if cache_dir is None: - cache_dir = text_file.parent - else: - cache_dir = Path(cache_dir).expanduser() - vocab_size = getattr(self.tokenizer, "vocab_size", 0) - features_pkl = cache_dir / "cached.{}.{}.max_seq_length{}.vocab{}.{}.punctuation_capitalization.pkl".format( - '__' + text_file.name + '__' + labels_file.name + '__', - self.tokenizer.name, - max_seq_length, - vocab_size, - f'num_samples{num_samples}' if num_samples > 0 else 'all_samples', - ) - return features_pkl - - @staticmethod - def _check_constructor_parameters( - text_file: Union[str, os.PathLike], - labels_file: Union[str, os.PathLike], - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - punct_label_vocab_file: Union[str, os.PathLike], - capit_label_vocab_file: Union[str, os.PathLike], - num_samples: int, - use_cache: bool, - number_of_batches_is_multiple_of: int, - use_audio: bool = False, - audio_file: Optional[Union[str, os.PathLike]] = None, - sample_rate: Optional[int] = None, - ) -> None: - if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1 and not use_cache: - raise ValueError( - f"If you 
already created process group and the world size is greater than 1, then `use_cache` " - f"parameter has to be `True`. Only master process prepares features and if `use_cache=False`, then " - f"other processes will not be able to obtain features. Alternatively, you may set `use_cache=False` " - f"and set up data before spawning processes. Use `cache_dir` dataset directory with " - f"`text_file` and `labels_file` is read-only." - ) - if not (os.path.exists(text_file) and os.path.exists(labels_file)): - raise FileNotFoundError( - f'{text_file} or {labels_file} not found. The data should be split into 2 files: text.txt and ' - f'labels.txt. Each line of the text.txt file contains text sequences, where words are separated with ' - f'spaces. The labels.txt file contains corresponding labels for each word in text.txt, the labels are ' - f'separated with spaces. Each line of the files should follow the format:\n' - f' [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and ' - f' [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' - ) - if not use_audio and audio_file: - raise ValueError(f"Audio file {audio_file} was passed but use_audio was set to False") - if use_audio and audio_file and not os.path.exists(audio_file): - raise FileNotFoundError( - f'use_audio was set to True but {audio_file} not found. Audio data should be listed in .txt file with one path per line' - ) - if punct_label_ids is not None and punct_label_vocab_file is not None: - punct_label_vocab_file = Path(punct_label_vocab_file).expanduser() - file_punct_label_ids = load_label_ids(punct_label_vocab_file) - if file_punct_label_ids != punct_label_ids: - raise_not_equal_labels_error( - first_labels=punct_label_ids, - second_labels=file_punct_label_ids, - first_labels_desc='Punctuation labels passed to the `PunctuationCapitalizationDataset` ' - 'constructor in parameter `punct_label_ids`', - second_labels_desc=f'Punctuation labels loaded from file {punct_label_vocab_file} path to which ' - f'is passed in parameter `punct_label_vocab_file`', - ) - if capit_label_ids is not None and capit_label_vocab_file is not None: - capit_vocab_file = Path(capit_label_vocab_file).expanduser() - file_capit_label_ids = load_label_ids(capit_vocab_file) - if file_capit_label_ids != capit_label_ids: - raise_not_equal_labels_error( - first_labels=capit_label_ids, - second_labels=file_capit_label_ids, - first_labels_desc='Capitalization labels passed to the `PunctuationCapitalizationDataset` ' - 'constructor in parameter `capit_label_ids`', - second_labels_desc=f'Capitalization labels loaded from file {capit_label_vocab_file} path to ' - f'which is passed in parameter `capit_label_vocab_file`', - ) - if num_samples == 0: - raise ValueError( - f"Parameter `num_samples` has to be positive or negative whereas `num_samples={num_samples}`. " - f"Negative `num_samples` is for using all samples in a dataset." - ) - if number_of_batches_is_multiple_of < 1 or not isinstance(number_of_batches_is_multiple_of, int): - raise ValueError( - f"Parameter `number_of_batches_is_multiple_of` has to be positive integer whereas " - f"{number_of_batches_is_multiple_of} is given." 
- ) - - if use_audio and not isinstance(sample_rate, int): - raise TypeError(f'use_audio was set to True but sample_rate was not set') - - if use_audio and sample_rate < 1: - raise ValueError(f'sample_rate set to {sample_rate} but it cannot be less than 1') - - def _check_label_ids_loaded_from_pkl( - self, - parameter_punct_label_ids: Dict[str, int], - parameter_capit_label_ids: Dict[str, int], - pkl_punct_label_ids: Any, - pkl_capit_label_ids: Any, - punct_label_vocab_file: Optional[Path], - capit_label_vocab_file: Optional[Path], - ) -> None: - if not isinstance(pkl_punct_label_ids, dict): - raise ValueError( - f"Punctuation label ids loaded from features file {self.features_pkl} have wrong type " - f"{type(pkl_punct_label_ids)}" - ) - if parameter_punct_label_ids is not None: - if parameter_punct_label_ids != pkl_punct_label_ids: - raise_not_equal_labels_error( - first_labels=parameter_punct_label_ids, - second_labels=pkl_punct_label_ids, - first_labels_desc="Punctuation labels passed in parameter `punct_label_ids`" - if punct_label_vocab_file is None - else f"Punctuation labels loaded from file {punct_label_vocab_file}", - second_labels_desc=f"Punctuation label ids loaded from features file {self.features_pkl}", - ) - if not isinstance(pkl_capit_label_ids, dict): - raise ValueError( - f"Capitalization label ids loaded from features file {self.features_pkl} has wrong type " - f"{type(pkl_capit_label_ids)}" - ) - if parameter_capit_label_ids is not None: - if parameter_capit_label_ids != pkl_capit_label_ids: - raise_not_equal_labels_error( - first_labels=parameter_capit_label_ids, - second_labels=pkl_capit_label_ids, - first_labels_desc="Capitalization labels passed in parameter `capit_label_ids`" - if capit_label_vocab_file is None - else f"Capitalization labels loaded from file {capit_label_vocab_file}", - second_labels_desc=f"Capitalization label ids loaded from features file {self.features_pkl}", - ) - - @staticmethod - def _check_label_ids_vs_unique_labels( - label_ids: Dict[str, int], unique_labels: Set[str], label_type: str, task: str, label_file: Path - ) -> None: - if unique_labels - set(label_ids): - not_present_labels = list(unique_labels - set(label_ids)) - raise ValueError( - f"{len(not_present_labels)} {task} labels found in {label_file} are not present in " - f"`{label_type}_label_ids`. Examples of unexpected labels from {label_file}: {not_present_labels[:3]}" - ) - - @staticmethod - def _read_dataset( - text_file: Path, labels_file: Path, num_samples: int, audio_file: Optional[Path] = None - ) -> Union[Tuple[Any, Any, Any, Set[Any], Set[Any], Any], Tuple[Any, Any, Any, Set[Any], Set[Any]]]: - with open(text_file, 'r', encoding='utf_8') as f: - text_lines = f.readlines() - punct_unique_labels, capit_unique_labels = set(), set() - punct_labels_lines, capit_labels_lines = [], [] - with labels_file.open(encoding='utf_8') as f: - for i, line in enumerate(f): - pairs = line.split() - if not all([len(p) == 2 for p in pairs]): - raise ValueError( - f"Some label pairs are not pairs but have wrong length (!= 2) in line {i} in label file " - f"{labels_file}" - ) - words = text_lines[i].split() - if len(pairs) != len(words): - raise ValueError( - f"In line {i} in text file {text_file} number of words {len(words)} is not equal to the " - f"number of labels {len(pairs)} in labels file {labels_file}." 
- ) - punct_line, capit_line = zip(*pairs) - punct_labels_lines.append(punct_line) - capit_labels_lines.append(capit_line) - punct_unique_labels.update(punct_line) - capit_unique_labels.update(capit_line) - if len(punct_labels_lines) != len(text_lines): - raise ValueError( - f"Number of text lines {len(text_lines)} in text file {text_file} is not equal to the number of lines " - f"{len(punct_labels_lines)} in labels file {labels_file}." - ) - - if audio_file: - with open(audio_file, 'r') as f: - audio_lines = f.readlines() - if len(audio_lines) != len(text_lines): - raise ValueError( - f'Number of lines in {audio_file} equals {len(audio_lines)} which is not equal to ' - f'number of lines in {text_file} which is {len(text_lines)}' - ) - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines, audio_lines)) - else: - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - if len(dataset) == 0: - raise ValueError(f"Dataset loaded from files {text_file} and {labels_file} is empty.") - if num_samples > 0: - dataset = dataset[:num_samples] - if audio_file: - text_lines, punct_labels_lines, capit_labels_lines, audio_lines = zip(*dataset) - return ( - text_lines, - punct_labels_lines, - capit_labels_lines, - punct_unique_labels, - capit_unique_labels, - audio_lines, - ) - else: - text_lines, punct_labels_lines, capit_labels_lines = zip(*dataset) - return text_lines, punct_labels_lines, capit_labels_lines, punct_unique_labels, capit_unique_labels, None - - @staticmethod - def calc_batch_seq_length(queries: List[np.ndarray], length_is_multiple_of: int) -> int: - return ceil(max([len(elem) for elem in queries]) / length_is_multiple_of) * length_is_multiple_of - - def _adjust_number_of_batches( - self, - input_ids: List[np.ndarray], - batch_beginnings: List[int], - batch_sizes: List[int], - batch_seq_lengths: List[int], - ) -> Tuple[List[int], List[int], List[int]]: - """ - If length of ``batch_sizes`` list is not divisible by ``self.number_of_batches_is_multiple_of``, then - one or several batches are split into parts until number of batches is divisible by - ``self.number_of_batches_is_multiple_of``. - - The method selects a batch and tries to slice smaller batches with 8 elements each from the batch. If - the batch cannot be sliced any further and there are still not enough batches, then the next batch from dataset - is selected. - - If slicing batches of size 8 is not enough, then batches of size 1 are created. - - If dataset is too small to create enough batches, then a warning is shown. - - Args: - input_ids: tokenized queries of the dataset. `input_ids` are expected to be sorted by length in ascending - order. - batch_beginnings: indices of first elements of batches created inside :meth:`_mark_up_batches` method. - Expected to be sorted in ascending order. - batch_sizes: sizes of batches created inside :meth:`_mark_up_batches` method. - batch_seq_lengths: lengths of elements in batch after padding created inside :meth:`_mark_up_batches` - method. 
- - Returns: - batch_beginnings: a list of indices in ``input_ids`` of first samples of every batch - batch_sizes: a list of numbers of samples in batches - batch_seq_lengths: a list of sequence lengths after padding for every batch - """ - batch_beginnings, batch_sizes = batch_beginnings.copy(), batch_sizes.copy() - batch_seq_lengths = batch_seq_lengths.copy() - num_missing_batches = ( - self.number_of_batches_is_multiple_of - len(batch_sizes) % self.number_of_batches_is_multiple_of - ) - if num_missing_batches == 0: - return batch_beginnings, batch_sizes, batch_seq_lengths - if sum(batch_sizes) - len(batch_sizes) < num_missing_batches: - logging.warning( - f"Unable to achieve number of batches multiple of {self.number_of_batches_is_multiple_of} because " - f"dataset in files '{self.text_file}' and '{self.labels_file}' contains not enough queries " - f"({sum(batch_sizes)}) or queries in the dataset are too long. Dataset will have " - f"{len(batch_sizes)} batches instead. For validation or test dataset if multiple GPUs are used " - f"this will lead to distorted metrics because some batches will be processed several times. " - f"To fix this problem you may try to tweak (increase) parameter `tokens_in_batch`, though result is " - f"not guaranteed." - ) - return batch_beginnings, batch_sizes, batch_seq_lengths - num_cut = 0 - for ss in [8, 1]: # ss - split_size - old_num_batches = len(batch_sizes) - # Starting from the last batch because its size is likely to be not multiple of 8. Thus number of - # batches which size is not multiple of 8 can be reduced by 1. - original_batch_index = old_num_batches - 1 - while original_batch_index >= 0 and num_cut < num_missing_batches: - bs, bb = batch_sizes[original_batch_index], batch_beginnings[original_batch_index] - rb = 0 # an index of sliced first element of sliced batch in original batch (relative beginning) - if rb < bs - ss: - while rb < bs - ss and num_cut < num_missing_batches: - batch_sizes.append(ss) - batch_beginnings.append(bb + rb) - batch_seq_lengths.append( - self.calc_batch_seq_length(input_ids[bb + rb : bb + rb + ss], length_is_multiple_of=8) - ) - rb += ss - num_cut += 1 - assert len(input_ids[bb + rb : bb + bs]) > 0 - batch_sizes[original_batch_index] = bs - rb - batch_beginnings[original_batch_index] = bb + rb - batch_seq_lengths[original_batch_index] = self.calc_batch_seq_length( - input_ids[bb + rb : bb + bs], length_is_multiple_of=8 - ) - original_batch_index -= 1 - # Keeping order of batches. - batch_beginnings, batch_sizes, batch_seq_lengths = map( - list, zip(*sorted(zip(batch_beginnings, batch_sizes, batch_seq_lengths), key=lambda x: x[0])) - ) - assert len(batch_beginnings) % self.number_of_batches_is_multiple_of == 0 - assert len(batch_sizes) % self.number_of_batches_is_multiple_of == 0 - assert len(batch_seq_lengths) % self.number_of_batches_is_multiple_of == 0 - return batch_beginnings, batch_sizes, batch_seq_lengths - - def _mark_up_batches(self, input_ids: List[np.ndarray]) -> Tuple[List[int], List[int], List[int]]: - """ - Computes indices of first samples in batch, batch sizes, seq lengths for batches. ``input_ids`` has to be - sorted by number of tokens in ascending order. 
- - Batches are marked up with respect to following conditions: - - total number of tokens in batch including paddings is less or equal to ``self.tokens_in_batch`` - - batch size is evenly divisible by 8 (except for the last batch) - - seq length (elements of the third returned object) is evenly divisible by 8 - - If ``self.batch_mark_up_progress_queue`` is not None, then the progress in mark up is reported via - ``self.batch_mark_up_progress_queue``. Otherwise, ``tqdm`` instance is created in this function. - - Args: - input_ids: a list of 1D int32 arrays. Elements of ``input_ids`` have to be sorted by length in ascending - order - - Returns: - batch_beginnings: a list of indices in ``input_ids`` of first samples of every batch - batch_sizes: a list of numbers of samples in batches - batch_seq_lengths: a list of sequence lengths after padding for every batch - """ - batch_beginnings, batch_sizes, batch_seq_lengths = [], [], [] - current_max_length = 0 - start = 0 - if self.batch_mark_up_progress_queue is None: - inp_iterator = tqdm(enumerate(input_ids), total=len(input_ids), desc="Batch mark up", unit="query") - else: - inp_iterator = enumerate(input_ids) - progress_made = 0 - for i, inp in inp_iterator: - current_max_length = max(current_max_length, ceil(len(inp) / 8) * 8) - if current_max_length * (i + 1 - start) > self.tokens_in_batch: - batch_size = (i - start) // 8 * 8 - if batch_size == 0: - if i > start: - batch_size = i - start - logging.warning( - f"Could not create batch with multiple of 8 size. Probably, there is a too long sequence " - f"in the dataset or parameter `tokens_in_batch` is too small. Current length of sequences " - f"in batch is {current_max_length}. Batch size will be reduced to {batch_size}. " - f"tokens_in_batch={self.tokens_in_batch}. The batch includes sequences from " - f"{start} to {i - 1}." - ) - else: - logging.warning( - f"Input sequence number {i - 1} is too long. Could not fit it into batch with " - f"{self.tokens_in_batch} tokens. Sequence number {i - 1} will not be added to batches." 
- ) - start = i - current_max_length = ceil(len(inp) / 8) * 8 - continue - seq_length = self.calc_batch_seq_length(input_ids[start : start + batch_size], length_is_multiple_of=8) - batch_beginnings.append(start) - batch_sizes.append(batch_size) - batch_seq_lengths.append(seq_length) - start += batch_size - current_max_length = self.calc_batch_seq_length(input_ids[start : i + 1], length_is_multiple_of=8) - if self.batch_mark_up_progress_queue is not None: - progress_made += 1 - if progress_made >= BATCH_MARK_UP_PROGRESS_REPORT_PERIOD: - self.batch_mark_up_progress_queue.put(progress_made) - progress_made = 0 - if start < len(input_ids): - seq_length = self.calc_batch_seq_length(input_ids[start:], length_is_multiple_of=8) - batch_beginnings.append(start) - batch_sizes.append(len(input_ids) - start) - batch_seq_lengths.append(seq_length) - if self.batch_mark_up_progress_queue is not None: - self.batch_mark_up_progress_queue.put(progress_made) - if len(batch_beginnings) % self.number_of_batches_is_multiple_of: - batch_beginnings, batch_sizes, batch_seq_lengths = self._adjust_number_of_batches( - input_ids, batch_beginnings, batch_sizes, batch_seq_lengths - ) - assert sum(batch_sizes) == len(input_ids) - for i in range(len(batch_beginnings) - 1): - assert batch_beginnings[i] + batch_sizes[i] == batch_beginnings[i + 1] - assert batch_seq_lengths[i] >= max( - [len(inp) for inp in input_ids[batch_beginnings[i] : batch_beginnings[i] + batch_sizes[i]]] - ) - return batch_beginnings, batch_sizes, batch_seq_lengths - - def _form_batches( - self, - input_ids: List[np.ndarray], - subtokens_mask: List[np.ndarray], - punct_labels: List[np.ndarray], - capit_labels: List[np.ndarray], - waveforms: Optional[List[np.ndarray]] = None, - audio_lengths: Optional[List[np.ndarray]] = None, - audio_filepaths: Optional[List[str]] = None, - ) -> List[Dict[str, np.ndarray]]: - """ - - Args: - input_ids: a list of 1D int32 arrays which contain token ids of dataset source - subtokens_mask: a list of 1D boolean arrays which elements are ``True`` if corresponding token is the - first token in some word - punct_labels: a list of 1D int32 arrays which contain encoded punctuation labels - capit_labels: a list of 1D int32 arrays which contain encoded capitalization labels - waveforms: a list of 1D float arrays which contain raw waveforms of audios. - audio_lengths: a list of 1D int32 arrays which contain length of corresponding audio from `waveforms` - audio_filepaths: a list of strings which contain paths to audio - - Returns: - a list of batches. Each batch is a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array; - - ``'subtokens_mask'``: a boolean numpy array; - - ``'punct_labels'``: a ``np.int32`` numpy array; - - ``'capit_labels'``: a ``np.int32`` numpy array. - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then a batch also contain items - - ``'segment_ids'``: a ``np.int8`` numpy array; - - ``'input_mask'``: a boolean numpy array; - - ``'loss_mask'``: a boolean numpy array. - If ``waveforms`` is not ``None``, then a batch also contain items - - ``features``: a ``np.float64`` numpy array. - - ``features_length`` a ``np.int32`` numpy array. - If ``audio_filepaths`` is not ``None``, then a natch also contain items - - ``audio_filepaths`` a list of strings. - - The values of a batch dictionary are numpy arrays of identical shape. 
- """ - batches = [] - dummy = [None] * len(input_ids) - - zipped = list( - zip( - input_ids, - subtokens_mask, - punct_labels, - capit_labels, - waveforms if waveforms else dummy, - audio_lengths if audio_lengths else dummy, - audio_filepaths if audio_filepaths else dummy, - ) - ) - - for item in zipped: - batch = { - "input_ids": item[0], - "subtokens_mask": item[1], - "punct_labels": item[2].astype(np.int64), - "capit_labels": item[3].astype(np.int64), - } - if self.use_audio and self.preload_audios: - batch['features'] = item[4].astype(np.float64) - batch['features_length'] = item[5] - elif self.use_audio and not self.preload_audios: - batch['audio_filepaths'] = item[6] - batches.append(batch) - return batches - - def _pack_into_batches( - self, - input_ids: List[np.ndarray], - subtokens_mask: List[np.ndarray], - punct_labels: List[np.ndarray], - capit_labels: List[np.ndarray], - waveforms: Optional[List[np.ndarray]] = None, - audio_lengths: Optional[List[np.ndarray]] = None, - audio_filepaths: Optional[List[str]] = None, - ) -> List[Dict[str, np.ndarray]]: - """ - Shuffle input sequences, sort them by number of tokens, pad, and pack into batches which satisfy following - conditions: - - total number of tokens in batch including paddings is less or equal to ``self.tokens_in_batch`` - - batch size is evenly divisible by 8 (except for the last batch) - - seq length (elements of the third returned object) is evenly divisible by 8 - Created batches are shuffled before returning. - - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then ``'segment_ids'``, ``'loss_mask'``, and - ``'input_mask'`` are added to the batch. - - If ``self.batch_building_progress_queue`` is not ``None``, then padding progress is reported to - ``self.batch_building_progress_queue``. Otherwise, a new ``tqdm`` instance is created in ``pack_into_batches`` - method. - - Args: - input_ids: a list of 1D int32 arrays which contain token ids of dataset source - subtokens_mask: a list of 1D boolean arrays which elements are ``True`` if corresponding token is the - first token in some word - punct_labels: a list of 1D int32 arrays which contain encoded punctuation labels - capit_labels: a list of 1D int32 arrays which contain encoded capitalization labels - waveforms: a list of 1D float arrays which contain raw waveforms of audios. - audio_lengths: a list of 1D int32 arrays which contain length of corresponding audio from `waveforms` - audio_filepaths: a list of strings which contain paths to audio - - Returns: - a list of batches. Each batch is a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array; - - ``'subtokens_mask'``: a boolean numpy array; - - ``'punct_labels'``: a ``np.int32`` numpy array; - - ``'capit_labels'``: a ``np.int32`` numpy array. - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then a batch also contain items - - ``'segment_ids'``: a ``np.int8`` numpy array; - - ``'input_mask'``: a boolean numpy array; - - ``'loss_mask'``: a boolean numpy array. - If ``waveforms`` is not ``None``, then a batch also contain items - - ``features``: a ``np.float64`` numpy array. - - ``features_length`` a ``np.int32`` numpy array. - If ``audio_filepaths`` is not ``None``, then a natch also contain items - - ``audio_filepaths`` a list of strings. - - The values of a batch dictionary are numpy arrays of identical shape. 
- """ - dummy = [None] * len(input_ids) - zipped = list( - zip( - input_ids, - subtokens_mask, - punct_labels, - capit_labels, - waveforms if waveforms else dummy, - audio_lengths if audio_lengths else dummy, - audio_filepaths if audio_filepaths else dummy, - ) - ) - self.batch_shuffling_random_state.shuffle(zipped) - - dim_sort = 4 if self.use_audio and self.preload_audios else 0 - - input_ids, subtokens_mask, punct_labels, capit_labels, waveforms, audio_lengths, audio_filepaths = zip( - *sorted(zipped, key=lambda x: x[dim_sort].shape[0]) - ) - batch_beginnings, batch_sizes, batch_seq_lengths = self._mark_up_batches(input_ids) - batches = [] - if self.batch_building_progress_queue is None: - inp_iterator = tqdm( - zip(batch_beginnings, batch_sizes, batch_seq_lengths), - total=len(batch_beginnings), - desc="Batch building", - unit="batch", - ) - else: - # In this case we report number of queries not number of batches - inp_iterator = zip(batch_beginnings, batch_sizes, batch_seq_lengths) - progress_made = 0 - for start, size, length in inp_iterator: - batch_input_ids = pad(input_ids[start : start + size], length, self.tokenizer.pad_id) - batch_subtokens_mask = pad(subtokens_mask[start : start + size], length, False) - batch = { - "input_ids": batch_input_ids, - "subtokens_mask": batch_subtokens_mask, - "punct_labels": pad( - punct_labels[start : start + size], length, self.punct_label_ids[self.pad_label] - ).astype(np.int64), - "capit_labels": pad( - capit_labels[start : start + size], length, self.capit_label_ids[self.pad_label] - ).astype(np.int64), - } - if self.use_audio and self.preload_audios: - batch['features'] = pad( - waveforms[start : start + size], max(audio_lengths[start : start + size]), 0.0 - ).astype(np.float64) - batch['features_length'] = audio_lengths[start : start + size] - elif self.use_audio and not self.preload_audios: - batch['audio_filepaths'] = audio_filepaths[start : start + size] - - if self.add_masks_and_segment_ids_to_batch: - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch_input_ids, - batch_subtokens_mask, - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = batch_segment_ids - batch['input_mask'] = batch_input_mask - batch['loss_mask'] = batch_loss_mask - batches.append(batch) - if self.batch_building_progress_queue is not None: - progress_made += size - if progress_made >= BATCH_BUILDING_PROGRESS_REPORT_PERIOD: - self.batch_building_progress_queue.put(progress_made) - progress_made = 0 - if self.batch_building_progress_queue is not None: - self.batch_building_progress_queue.put(progress_made) - self.batch_shuffling_random_state.shuffle(batches) - return batches - - def repack_batches_with_shuffle(self) -> None: - """A function for proper shuffling of a dataset. 
Pytorch data loader shuffling will only permute batches.""" - if not self.use_bucketing: - return - logging.info("Shuffling training dataset") - self.batches = self._pack_into_batches( - self.input_ids, - self.subtokens_mask, - self.punct_labels, - self.capit_labels, - self.waveforms, - self.waveforms_length, - self.audio_filepaths, - ) - - def _calculate_and_save_label_frequencies(self, all_labels: List[np.ndarray], name: str) -> Dict[str, float]: - """Calculates and saves labels frequencies in :attr:`label_info_save_dir`.""" - merged_labels = itertools.chain.from_iterable(all_labels) - if self.verbose: - logging.info('Three most popular labels') - self.label_info_save_dir.mkdir(parents=True, exist_ok=True) - _, label_frequencies, _ = get_label_stats( - merged_labels, str(self.label_info_save_dir / f'label_count_{name}.tsv') - ) - return label_frequencies - - def save_labels_and_get_file_paths( - self, punct_labels_file_name: str, capit_labels_file_name: str - ) -> Tuple[Path, Path]: - """ - Saves label ids into files located in ``self.label_info_save_dir``. Saved label ids are usually used for - ``.nemo`` checkpoint creation. - - The signatures of this method and the signature of the method - :meth:`~nemo.collections.nlp.data.token_classification.BertPunctuationCapitalizationTarredDataset.save_labels_and_get_file_paths` - must be identical. - - Args: - punct_labels_file_name (:obj:`str`): a name of a punctuation labels file - capit_labels_file_name (:obj:`str`): a name of a capitalization labels file - - Returns: - :obj:`Tuple[pathlib.Path, pathlib.Path]`: a tuple containing: - - - :obj:`pathlib.Path`: a path to the saved punctuation labels file - - :obj:`pathlib.Path`: a path to the saved capitalization labels file - """ - nemo_dir = self.label_info_save_dir / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - punct_labels_file = nemo_dir / punct_labels_file_name - capit_labels_file = nemo_dir / capit_labels_file_name - save_label_ids(self.punct_label_ids, punct_labels_file) - save_label_ids(self.capit_label_ids, capit_labels_file) - return punct_labels_file, capit_labels_file - - def __len__(self) -> int: - return len(self.batches) - - def collate_fn(self, batches: List[Dict[str, np.ndarray]]) -> Dict[str, torch.Tensor]: - """ - If ``self.use_bucketing`` set to ``True`` returns zeroth batch from ``batches`` list passed for collating and casts ``'segment_ids'``, ``'punct_labels'``, - ``'capit_labels'`` to types supported by - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - or :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationLexicalAudioModel` if ``self.use_audio`` set to ``True`` - All output tensors have shape ``[Batch, Time]``. - - .. 
warning:: - A ``batch_size`` parameter of a PyTorch data loader and sampler has to be ``1`` if ``self.use_bucketing`` set to ``True`` - - Args: - batches (:obj:`List[Dict[str, np.ndarray]]`): a list containing 1 batch passed for collating - - Returns: - :obj:`Dict[str, torch.Tensor]`: a batch dictionary with following items (for detailed description of batch - items see method :meth:`__getitem__`): - - - ``'input_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'subtokens_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'punct_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'capit_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'segment_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'input_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'loss_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor. - - ``'features'`` (:obj:`torch.Tensor`): :obj:`torch.float` tensor. - - ``'features_length'`` (:obj:`torch.Tensor`): :obj:`torch.long` tensor. - """ - if self.use_bucketing: - batch = {k: torch.as_tensor(v) for k, v in batches[0].items() if k != 'audio_filepaths'} - batch['segment_ids'] = batch['segment_ids'].int() - batch['punct_labels'] = batch['punct_labels'].long() - batch['capit_labels'] = batch['capit_labels'].long() - if self.use_audio and self.preload_audios: - batch['features'] = batch['features'].to(torch.float32) - return batch - else: - for batch in batches: - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch['input_ids'], - batch['subtokens_mask'], - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = torch.as_tensor(batch_segment_ids, dtype=torch.int) - batch['input_mask'] = torch.as_tensor(batch_input_mask) - batch['loss_mask'] = torch.as_tensor(batch_loss_mask) - batch['input_ids'] = torch.as_tensor(batch['input_ids'], dtype=torch.int) - batch['subtokens_mask'] = torch.as_tensor(batch['subtokens_mask']) - batch['punct_labels'] = torch.as_tensor(batch['punct_labels'], dtype=torch.long) - batch['capit_labels'] = torch.as_tensor(batch['capit_labels'], dtype=torch.long) - if 'features' in batch: - batch['features'] = torch.as_tensor(batch['features'], dtype=torch.float) - batch['features_length'] = torch.as_tensor(batch['features_length'], dtype=torch.long) - elif self.use_audio: - if ASR_AVAILABLE: - waveform = AudioSegment.from_file(batch['audio_filepaths'], target_sr=self.sample_rate) - batch['features'] = torch.as_tensor(waveform.samples, dtype=torch.float) - batch['features_length'] = torch.as_tensor(waveform.num_samples, dtype=torch.long) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - - segment_ids = pad_sequence([batch['segment_ids'] for batch in batches]) - input_mask = pad_sequence([batch['input_mask'] for batch in batches]) - loss_mask = pad_sequence([batch['loss_mask'] for batch in batches]) - input_ids = pad_sequence([batch['input_ids'] for batch in batches], padding_value=self.tokenizer.pad_id) - subtokens_mask = pad_sequence([batch['subtokens_mask'] for batch in batches], padding_value=False) - punct_labels = pad_sequence([batch['punct_labels'] for batch in batches], padding_value=0) - capit_labels = pad_sequence([batch['capit_labels'] for batch in batches], padding_value=0) - features = 
pad_sequence([batch['features'] for batch in batches], padding_value=0.0) - features_length = torch.tensor([batch['features_length'] for batch in batches]) - return { - 'input_ids': input_ids.T, - 'subtokens_mask': subtokens_mask.T, - 'punct_labels': punct_labels.T, - 'capit_labels': capit_labels.T, - 'features': features.T, - 'features_length': features_length, - 'segment_ids': segment_ids.T, - 'input_mask': input_mask.T, - 'loss_mask': loss_mask.T, - } - - def __getitem__(self, idx: int) -> Dict[str, np.ndarray]: - """ - Return a batch with index ``idx``. The values of a batch dictionary are numpy arrays of identical shapes - ``[Batch, Time]``. Labels are identical for all tokens in a word. For example, if - - - word ``'Tokenization'`` is tokenized into tokens ``['token', 'ization']``, - - it is followed by comma, - - then punctuation labels are ``[',', ',']`` and capitalization labels are ``['U', 'U']`` (``'U'`` is a label - for words which start with upper case character). - - Args: - idx: an index of returned batch - - Returns: - :obj:`Dict[str, np.ndarray]`: a dictionary with items: - - - ``'input_ids'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded tokens, - - ``'subtokens_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if they - correspond to first token in a word, - - ``'punct_labels'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded punctuation - labels, - - ``'capit_labels'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded capitalization - labels. - - ``'segment_ids'`` (:obj:`numpy.ndarray`): :obj:`numpy.int8` array filled with zeros (BERT token types - in HuggingFace terminology) (if ``self.add_masks_and_segment_ids_to_batch`` is ``False``, then these - items is missing), - - ``'input_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if corresponding - token is not a padding token (if ``self.add_masks_and_segment_ids_to_batch`` is ``False``, then these - items is missing), - - ``'loss_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if loss is - computed for corresponding token. See more in description of constructor parameters - ``ignore_start_end``, ``ignore_extra_tokens`` (if ``self.add_masks_and_segment_ids_to_batch`` is - ``False``, then these items is missing). - - ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float64` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty. - - ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.longlong` array of number of samples per audio. - - ``'audio_filepaths'`` (:obj:`List`) :obj:`str` contains paths of audio files if ``self.preload_audio`` set to ``False`` - """ - return self.batches[idx] diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py deleted file mode 100644 index 13bb30403553..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import itertools -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from numpy import ndarray -from torch import Tensor -from torch.nn.utils.rnn import pad_sequence - -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.collections.nlp.data import get_stats -from nemo.core import Dataset -from nemo.core.neural_types import ChannelType, Index, MaskType, NeuralType -from nemo.core.neural_types.elements import AudioSignal, BoolType, LengthsType -from nemo.utils import logging - -try: - from nemo.collections.asr.parts.preprocessing import AudioSegment - - ASR_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False - - -def get_features_infer( - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = 64, - step: Optional[int] = 8, - margin: Optional[int] = 16, - audio_queries: Optional[Union[List[bytes], List[str]]] = None, - target_sr: Optional[int] = None, -) -> Tuple[ - List[List[int]], - List[List[int]], - List[List[int]], - List[List[int]], - List[int], - List[int], - List[bool], - List[bool], - Optional[List[float]], - Optional[List[int]], -]: - """ - Processes the data and returns features. - - Args: - queries: text sequences - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - step: relative shift of consequent segments into which long queries are split. Long queries are split into - segments which can overlap. Parameter ``step`` controls such overlapping. Imagine that queries are - tokenized into characters, ``max_seq_length=5``, and ``step=2``. In such a case query "hello" is - tokenized into segments ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. - margin: number of subtokens near edges of segments which are not used for punctuation and capitalization - prediction. The first segment does not have left margin and the last segment does not have right - margin. For example, if input sequence is tokenized into characters, ``max_seq_length=5``, - ``step=1``, and ``margin=1``, then query "hello" will be tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], - ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions - computation, margins are removed. In the next list, subtokens which logits are not used for final - predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], - ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. - audio_queries (:obj:`List[str]`, `optional`): paths to audio files. - target_sr (:obj:`int`, `optional`): target sample rate for audios. 
- - Returns: - all_input_ids: list of input ids of all segments - all_segment_ids: token type ids of all segments - all_input_mask: attention mask to use for BERT model - all_subtokens_mask: masks out all subwords besides the first one - all_quantities_of_preceding_words: number of words in query preceding a segment. Used for joining - predictions from overlapping segments. - all_query_ids: index of a query to which segment belongs - all_is_first: is segment first segment in a query - all_is_last: is segment last segment in a query - all_audio_queries: audio waveform samples for every segment (only if ``audio_queries`` is passed) - all_audio_lengths: number of audio samples for every segment (only if ``audio_queries`` is passed) - """ - st = [] - stm = [] - sent_lengths = [] - audios = [] - audio_queries = audio_queries if audio_queries else [None] * len(queries)  # Dummy if no `audio_queries` passed - for i, (query, audio_query) in enumerate(zip(queries, audio_queries)): - subtokens, subtokens_mask = _get_subtokens_and_subtokens_mask(query, tokenizer) - sent_lengths.append(len(subtokens)) - st.append(subtokens) - stm.append(subtokens_mask) - if audio_query: - if ASR_AVAILABLE: - if isinstance(audio_query, bytes): - audios.append(AudioSegment.from_file(io.BytesIO(audio_query), target_sr=target_sr)) - elif isinstance(audio_query, str): - audios.append(AudioSegment.from_file(audio_query.strip(), target_sr=target_sr)) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - audios = audios if len(audios) else [None] * len(st) - _check_max_seq_length_and_margin_and_step(max_seq_length, margin, step) - if max_seq_length > max(sent_lengths) + 2: - max_seq_length = max(sent_lengths) + 2 - # If `max_seq_length` is greater than the maximum length of input queries, parameters ``margin`` and ``step`` are - # not used. - step = 1 - # Maximum number of word subtokens in segment. The first and the last tokens in segment are [CLS] and [SEP] - length = max_seq_length - 2 - else: - # Maximum number of word subtokens in segment. 
The first and the last tokens in segment are [CLS] and [SEP] - length = max_seq_length - 2 - step = min(length - margin * 2, step) - logging.info(f'Max length: {max_seq_length}') - get_stats(sent_lengths) - all_input_ids, all_segment_ids, all_subtokens_mask, all_input_mask = [], [], [], [] - all_quantities_of_preceding_words, all_query_ids, all_is_first, all_is_last = [], [], [], [] - all_audio_queries, all_audio_lengths = [], [] - for q_i, (query_st, query_audio) in enumerate(zip(st, audios)): - q_inp_ids, q_segment_ids, q_subtokens_mask, q_inp_mask, q_quantities_of_preceding_words = [], [], [], [], [] - q_audio_queries, q_audio_lengths = [], [] - if query_audio and length < len(query_st): - logging.info(f'Ignoring query with id {q_i}') - continue - for i in range(0, max(len(query_st), length) - length + step, step): - subtokens = [tokenizer.cls_token] + query_st[i : i + length] + [tokenizer.sep_token] - q_inp_ids.append(tokenizer.tokens_to_ids(subtokens)) - q_segment_ids.append([0] * len(subtokens)) - q_subtokens_mask.append([False] + stm[q_i][i : i + length] + [False]) - q_inp_mask.append([True] * len(subtokens)) - q_quantities_of_preceding_words.append(np.count_nonzero(stm[q_i][:i])) - if query_audio: - samples = query_audio.samples - q_audio_queries.append(samples) - q_audio_lengths.append(len(samples)) - all_input_ids.append(q_inp_ids) - all_segment_ids.append(q_segment_ids) - all_subtokens_mask.append(q_subtokens_mask) - all_input_mask.append(q_inp_mask) - all_quantities_of_preceding_words.append(q_quantities_of_preceding_words) - all_query_ids.append([q_i] * len(q_inp_ids)) - all_is_first.append([True] + [False] * (len(q_inp_ids) - 1)) - all_is_last.append([False] * (len(q_inp_ids) - 1) + [True]) - if query_audio: - all_audio_queries.append(q_audio_queries) - all_audio_lengths.append(q_audio_lengths) - return ( - list(itertools.chain(*all_input_ids)), - list(itertools.chain(*all_segment_ids)), - list(itertools.chain(*all_input_mask)), - list(itertools.chain(*all_subtokens_mask)), - list(itertools.chain(*all_quantities_of_preceding_words)), - list(itertools.chain(*all_query_ids)), - list(itertools.chain(*all_is_first)), - list(itertools.chain(*all_is_last)), - list(itertools.chain(*all_audio_queries)), - list(itertools.chain(*all_audio_lengths)), - ) - - -def _check_max_seq_length_and_margin_and_step(max_seq_length: int, margin: int, step: int): - """ - Checks values of ``max_seq_length``, ``margin``, and ``step``. - Args: - max_seq_length: a segment length with ``[CLS]`` and ``[SEP]`` tokens - margin: a number of input tokens near edges of segments which are not used in punctuation and capitalization - prediction. - step: offset of consequent segments. - Returns: - None - """ - if max_seq_length < 3: - raise ValueError( - f"Parameter `max_seq_length={max_seq_length}` cannot be less than 3 because `max_seq_length` is a length " - f"of a segment with [CLS] and [SEP] tokens." - ) - if margin >= (max_seq_length - 2) // 2 and margin > 0 or margin < 0: - raise ValueError( - f"Parameter `margin` has to be non-negative and less than `(max_seq_length - 2) // 2`. Don't forget about " - f"[CLS] and [SEP] tokens in the beginning and the end of segment. margin={margin}, " - f"max_seq_length={max_seq_length}" - ) - if step <= 0: - raise ValueError(f"Parameter `step` has to be positive whereas step={step} was given") - if step > max_seq_length - 2 - 2 * margin: - logging.warning( - f"Parameter step={step} is too big. It will be reduced to " - f"`max_seq_length - 2 - 2 * margin`." 
- ) - - -def _get_subtokens_and_subtokens_mask(query: str, tokenizer: TokenizerSpec) -> Tuple[List[str], List[bool]]: - """ - Tokenizes input query into subtokens and creates a subtokens mask. The subtokens mask is an array of the same - length as the subtokens array and contains zeros and ones: if an element of the mask equals 1, then the - corresponding subtoken in the subtokens array is the first subtoken in some word. - Args: - query: a string that will be tokenized - tokenizer: an instance of tokenizer - Returns: - subtokens: list of subtokens - subtokens_mask: list of bools - """ - if isinstance(query, Hypothesis): - query = query.text - words = query.strip().split() - subtokens = [] - subtokens_mask = [] - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - subtokens.extend(word_tokens) - subtokens_mask.append(True) - subtokens_mask.extend([False] * (len(word_tokens) - 1)) - return subtokens, subtokens_mask - - -class BertPunctuationCapitalizationInferDataset(Dataset): - """ - Creates a dataset to use during inference for punctuation and capitalization tasks with a pretrained model. - For a dataset to use during training with labels, see - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset` - and - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset`. - - Parameters ``max_seq_length``, ``step``, ``margin`` are for controlling the way queries are split into segments - which are then processed by the model. Parameter ``max_seq_length`` is a length of a segment after tokenization - including special tokens [CLS] in the beginning and [SEP] in the end of a segment. Parameter ``step`` is a shift - between consequent segments. Parameter ``margin`` is used to exclude the negative effect of subtokens near - borders of segments which have only one side context. - - Args: - queries (:obj:`List[str]`): list of sequences. - tokenizer (:obj:`TokenizerSpec`): a tokenizer which was used for model training. It should have properties - ``cls_id``, ``sep_id``, ``unk_id``, ``pad_id``. - max_seq_length (:obj:`int`, `optional`, defaults to :obj:`64`): max sequence length which includes [CLS] and - [SEP] tokens - step (:obj:`int`, `optional`, defaults to :obj:`8`): relative shift of consequent segments into which long - queries are split. Long queries are split into segments which can overlap. Parameter ``step`` controls such - overlapping. Imagine that queries are tokenized into characters, ``max_seq_length=5``, and ``step=2``. In - such a case query "hello" is tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. - margin (:obj:`int`, `optional`, defaults to :obj:`16`): number of subtokens in the beginning and the end of - segments which are not used for prediction computation. The first segment does not have left margin and the - last segment does not have right margin. For example, if input sequence is tokenized into characters, - ``max_seq_length=5``, ``step=1``, and ``margin=1``, then query "hello" will be tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], - ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions - computation, margins are removed. 
In the next list, subtokens which logits are not used for final - predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], - ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns neural types of :meth:`collate_fn` output.""" - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'quantities_of_preceding_words': NeuralType(('B',), Index()), - 'query_ids': NeuralType(('B',), Index()), - 'is_first': NeuralType(('B',), BoolType()), - 'is_last': NeuralType(('B',), BoolType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'quantities_of_preceding_words': NeuralType(('B',), Index()), - 'query_ids': NeuralType(('B',), Index()), - 'is_first': NeuralType(('B',), BoolType()), - 'is_last': NeuralType(('B',), BoolType()), - } - - def __init__( - self, - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = 64, - step: int = 8, - margin: int = 16, - audio_queries: Optional[Union[List[bytes], List[str]]] = None, - target_sr: Optional[int] = None, - ): - features = get_features_infer( - queries=queries, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - step=step, - margin=margin, - audio_queries=audio_queries, - target_sr=target_sr, - ) - self.all_input_ids: List[List[int]] = features[0] - self.all_segment_ids: List[List[int]] = features[1] - self.all_input_mask: List[List[int]] = features[2] - self.all_subtokens_mask: List[List[int]] = features[3] - self.all_quantities_of_preceding_words: List[int] = features[4] - self.all_query_ids: List[int] = features[5] - self.all_is_first: List[bool] = features[6] - self.all_is_last: List[bool] = features[7] - self.all_audio_queries: Optional[List[List[float]]] = features[8] - self.all_audio_lengths: Optional[List[List[int]]] = features[9] - self.use_audio = audio_queries is not None - - def __len__(self) -> int: - return len(self.all_input_ids) - - def collate_fn( - self, - batch: List[ - Tuple[ - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - int, - int, - bool, - bool, - Optional[np.ndarray], - Optional[np.ndarray], - ] - ], - ) -> Union[ - Tuple[Tensor, Tensor, Tensor, Tensor, Any, Any, Any, Any], - Tuple[Tensor, Tensor, Tensor, Tensor, Any, Any, Any, Any, Any, Any], - ]: - """ - Collates samples into batches. - - Args: - batch (:obj:`List[tuple]`): a list of samples returned by :meth:`__getitem__` method. - - Returns: - :obj:`Tuple[torch.Tensor (x4), Tuple[int, ...] (x2), Tuple[bool, ...] (x2)]`: a tuple containing 8 - elements: - - - ``input_ids`` (:obj:`torch.Tensor`): an integer tensor of shape ``[Batch, Time]`` containing encoded - input text. - - ``segment_ids`` (:obj:`torch.Tensor`): an integer tensor of shape ``[Batch, Time]`` filled with zeros. - - ``input_mask`` (:obj:`torch.Tensor`): a boolean tensor of shape ``[Batch, Time]`` which elements are - ``True`` if corresponding token is not a padding token. 
- - ``subtokens_mask`` (:obj:`torch.Tensor`): a boolean tensor of shape ``[Batch, Time]`` which elements - are ``True`` if corresponding tken is the first token in a word. - - ``quantities_of_preceding_words`` (:obj:`Tuple[int, ...]`): a tuple containing number of words in - a query preceding current segment. - - ``query_ids`` (:obj:`Tuple[int, ...]`): a tuple containing indices of queries to which segments belong. - - ``is_first`` (:obj:`Tuple[bool, ...]`): a tuple booleans which elements are ``True`` if corresponding - segment is the first segment in a query. - - ``is_last`` (:obj:`Tuple[bool, ...]`): a tuple of booleans which elements are ``True`` if corresponding - segment is the last segment in a query. - - """ - if not self.use_audio: - inp_ids, segment_ids, inp_mask, st_mask, n_preceding, query_ids, is_first, is_last = zip(*batch) - return ( - pad_sequence([torch.tensor(x) for x in inp_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in segment_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in inp_mask], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in st_mask], batch_first=True, padding_value=0), - n_preceding, - query_ids, - is_first, - is_last, - ) - ( - inp_ids, - segment_ids, - inp_mask, - st_mask, - n_preceding, - query_ids, - is_first, - is_last, - features, - features_length, - ) = zip(*batch) - return ( - pad_sequence([torch.tensor(x) for x in inp_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in segment_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in inp_mask], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in st_mask], batch_first=True, padding_value=0), - n_preceding, - query_ids, - is_first, - is_last, - pad_sequence([torch.tensor(x) for x in features], batch_first=True, padding_value=0).float(), - torch.tensor(features_length, dtype=torch.long), - ) - - def __getitem__(self, idx: int) -> Union[ - Tuple[ndarray, ndarray, ndarray, ndarray, int, int, bool, bool], - Tuple[ndarray, ndarray, ndarray, ndarray, int, int, bool, bool, ndarray, List[int]], - ]: - """ - Returns batch used for punctuation and capitalization inference. - - Args: - idx (:obj:`int`): a batch index - - Returns: - :obj:`Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, bool, bool]`: a tuple containing: - - - ``input_ids`` (:obj:`np.ndarray`): an integer numpy array of shape ``[Time]``. Ids of word - subtokens encoded using tokenizer passed in constructor ``tokenizer`` parameter. - - ``segment_ids`` (:obj:`np.ndarray`): an integer zeros numpy array of shape ``[Time]``. Indices - of segments for BERT model (token types in HuggingFace terminology). - - ``input_mask`` (:obj:`np.ndarray`): a boolean numpy array of shape ``[Time]``. An element of - this array is ``True`` if corresponding token is not padding token. - - ``subtokens_mask`` (:obj:`np.ndarray`): a boolean numpy array of shape ``[Time]``. An element - equals ``True`` if corresponding token is the first token in a word and ``False`` otherwise. For - example, if input query ``"language processing"`` is tokenized into - ``["[CLS]", "language", "process", "ing", "SEP"]``, then ``subtokens_mask`` will be - ``[False, True, True, False, False]``. - - ``quantities_of_preceding_words`` (:obj:`int`): a number of words preceding current segment in the - query to which the segment belongs. This parameter is used for uniting predictions from adjacent - segments. 
- - ``query_ids`` (:obj:`int`): an index of the query to which the segment belongs - - ``is_first`` (:obj:`bool`): whether a segment is the first segment in a query. The left margin of - the first segment in a query is not removed. - - ``is_last`` (:obj:`bool`): whether a segment is the last segment in a query. The right margin of the last - segment in a query is not removed. - """ - if not self.use_audio: - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.float32), - np.array(self.all_subtokens_mask[idx]), - self.all_quantities_of_preceding_words[idx], - self.all_query_ids[idx], - self.all_is_first[idx], - self.all_is_last[idx], - ) - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.float32), - np.array(self.all_subtokens_mask[idx]), - self.all_quantities_of_preceding_words[idx], - self.all_query_ids[idx], - self.all_is_first[idx], - self.all_is_last[idx], - np.array(self.all_audio_queries[idx], dtype=np.float64), - self.all_audio_lengths[idx], - ) diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py deleted file mode 100644 index e88d87ba7c45..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py +++ /dev/null @@ -1,1293 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -#     http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
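
Editor's note: the inference dataset removed in the file above is driven through its own ``collate_fn``, since ``__getitem__`` returns per-segment tuples rather than tensors. Below is a minimal, hypothetical sketch (not part of any deleted file) of how it might be wired to a PyTorch ``DataLoader``; the tokenizer name, the example queries, and the batch size are placeholders, and the text-only (no audio) code path is assumed.

from torch.utils.data import DataLoader

from nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset import (
    BertPunctuationCapitalizationInferDataset,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer

# Placeholder queries; long queries are split into overlapping segments internally.
queries = ["how are you", "great to see you today"]
# Assumption: any tokenizer exposing cls_id, sep_id, unk_id, and pad_id works here.
tokenizer = get_tokenizer("bert-base-uncased")

dataset = BertPunctuationCapitalizationInferDataset(
    queries=queries, tokenizer=tokenizer, max_seq_length=64, step=8, margin=16
)
# batch_size counts segments, not queries; collate_fn pads segments to a common length.
loader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collate_fn)

for input_ids, segment_ids, input_mask, subtokens_mask, n_preceding, query_ids, is_first, is_last in loader:
    # query_ids, is_first, and is_last are what allow per-segment predictions
    # to be stitched back into per-query predictions after the model runs.
    print(input_ids.shape, query_ids, is_first, is_last)
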
- -import inspect -import json -import multiprocessing as mp -import os -import pickle -import re -import shutil -import tempfile -from collections import deque -from pathlib import Path -from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Type, Union - -import numpy as np -import torch -import webdataset as wds -from joblib import Parallel, delayed -from omegaconf import DictConfig -from torch.utils.data import IterableDataset - -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset import ( - LABEL_ID_DIR_FOR_NEMO_CHECKPOINT, - BertPunctuationCapitalizationDataset, - Progress, - create_label_ids, - create_masks_and_segment_ids, - load_label_ids, - raise_not_equal_labels_error, -) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.neural_types import AudioSignal, ChannelType, LabelsType, LengthsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.distributed import webdataset_split_by_workers - -NUMBER_RE = "(0|[1-9][0-9]*)" -TAR_FRAGMENT_TMPL_IN_PROGRESS = "fragment{fragment_idx}.{file_idx}.tar" -TAR_FRAGMENT_TMPL_FINISHED = "fragment{fragment_idx}.num_batches{num_batches}.{file_idx}.tar" -TAR_FRAGMENT_TMPL_TO_REPACK = "fragment{fragment_idx}.num_batches{num_batches}.{file_idx}.tar.to_repack" -TAR_FRAGMENT_PATTERN_IN_PROGRESS = re.compile(f"fragment{NUMBER_RE}.{NUMBER_RE}.tar$") -TAR_FRAGMENT_PATTERN_FINISHED = re.compile(f"fragment{NUMBER_RE}.num_batches{NUMBER_RE}.{NUMBER_RE}.tar$") -TAR_FRAGMENT_PATTERN_TO_REPACK = re.compile(f"fragment{NUMBER_RE}.num_batches{NUMBER_RE}.{NUMBER_RE}.tar.to_repack$") -NOT_ALLOWED_CHARACTERS_IN_FILE_NAME = re.compile(f"[^a-zA-Z0-9_.-]") -REPLACE_NOT_ALLOWED_CHARACTERS_IN_FILE_NAME = re.compile(f"-*[^a-zA-Z0-9_.-]+-*") - -DATASET_PARAMETERS_TMPL = "{prefix}.tokens{tokens_in_batch}.max_seq_length{max_seq_length}.{tokenizer}" -TAR_FINAL_TMPL = ".batches{num_batches}.{ctr}.tar" - -PROGRESS_REPORT_PERIOD = 10 ** 4 - -METADATA_PUNCT_LABEL_VOCAB_KEY = 'punct_label_vocab_file' -METADATA_CAPIT_LABEL_VOCAB_KEY = 'capit_label_vocab_file' -DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME = 'punct_label_vocab.csv' -DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME = 'capit_label_vocab.csv' - - -def count_lines_and_get_fragment_starting_positions( - file_name: Path, lines_per_dataset_fragment: int -) -> Tuple[int, List[int]]: - """ - Returns number of lines in a file and indices of fragment starting bytes. - - Args: - file_name: a path to a text or label file - lines_per_dataset_fragment: number of lines in a dataset fragment. The last fragment can contain fewer lines - - Returns: - num_lines: number of lines in a file - start_bytes: indices of fragment starting bytes - """ - pos = [0] - with file_name.open() as f: - i = 0 - line = f.readline() - while line: - i += 1 - if i % lines_per_dataset_fragment == 0: - pos.append(f.tell()) - line = f.readline() - return i, pos[:-1] if i % lines_per_dataset_fragment == 0 else pos - - -def get_fragment_start_bytes( - text_file: Path, labels_file: Path, lines_per_dataset_fragment: int, audio_file: Path = None -) -> Union[Tuple[Any, Any, Any, Any], Tuple[Any, Any, Any]]: - """ - A function for calculating borders of dataset fragments. The function is used to split ``text_file`` and - ``labels_file`` for processing them in parallel. 
- - Args: - text_file: a path to a dataset source file - labels_file: a path to a dataset label file - lines_per_dataset_fragment: a number of lines in one fragment - audio_file: a path to a dataset audio file if one needed - - Returns: - num_lines: total number of elements in the dataset (number of lines in ``text_file``` and ``labels_file``) - text_start_bytes: indices of the first bytes of fragments in ``text_file`` - label_start_bytes: indices of the first bytes of fragments in ``labels_file`` - """ - logging.info( - f"Counting lines in files {text_file} and {labels_file} and creating segment borders. This may take " - f"considerable time. 86GB, 1.27b lines file was processed in 7 minutes." - ) - if audio_file: - result = Parallel(n_jobs=3)( - delayed(count_lines_and_get_fragment_starting_positions)(file_name, lines_per_dataset_fragment) - for file_name in [text_file, labels_file, audio_file] - ) - num_lines = result[0][0] - if result[0][0] != result[1][0]: - raise ValueError( - f"Text file {text_file} and label file {labels_file} contain different number of lines. Number of lines " - f"in text file: {result[0][0]}, number of lines in label file: {result[1][0]}." - ) - text_start_bytes, label_start_bytes, manifest_start_bytes = result[0][1], result[1][1], result[2][1] - assert len(text_start_bytes) == len(label_start_bytes) == len(manifest_start_bytes) - return num_lines, text_start_bytes, label_start_bytes, manifest_start_bytes - else: - result = Parallel(n_jobs=2)( - delayed(count_lines_and_get_fragment_starting_positions)(file_name, lines_per_dataset_fragment) - for file_name in [text_file, labels_file] - ) - num_lines = result[0][0] - if result[0][0] != result[1][0]: - raise ValueError( - f"Text file {text_file} and label file {labels_file} contain different number of lines. Number of lines " - f"in text file: {result[0][0]}, number of lines in label file: {result[1][0]}." 
- ) - text_start_bytes, label_start_bytes = result[0][1], result[1][1] - assert len(text_start_bytes) == len(label_start_bytes) - return num_lines, text_start_bytes, label_start_bytes - - -def process_fragment( - text_file: Path, - labels_file: Path, - output_dir: Path, - text_start_pos: int, - label_start_pos: int, - lines_per_dataset_fragment: int, - max_seq_length: int, - tokens_in_batch: int, - num_batches_per_tarfile: int, - tokenizer_name: str, - tokenizer_model: Optional[Path], - vocab_file: Optional[Path], - merges_file: Optional[Path], - special_tokens: Dict[str, str], - use_fast_tokenizer: Optional[bool], - pad_label: str, - punct_label_ids: Dict[str, int], - capit_label_ids: Dict[str, int], - fragment_idx: int, - tokenization_progress_queue: mp.Queue, - batch_mark_up_progress_queue: mp.Queue, - batch_building_progress_queue: mp.Queue, - writing_to_tar_progress_queue: mp.Queue, - audio_file: Path = None, - sample_rate: int = None, - audio_file_start_pos: int = None, - use_audio: bool = False, -) -> None: - tokenizer = get_tokenizer( - tokenizer_name, - tokenizer_model=None if tokenizer_model is None else str(tokenizer_model), - vocab_file=None if vocab_file is None else str(vocab_file), - merges_file=None if merges_file is None else str(merges_file), - special_tokens=special_tokens, - use_fast=use_fast_tokenizer, - ) - tmp_text: Optional[str] = None - tmp_labels: Optional[str] = None - tmp_audio: Optional[str] = None - try: - otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True) - olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True) - if use_audio: - oafd, tmp_audio = tempfile.mkstemp( - suffix='.txt', prefix=f'audio_{fragment_idx}_', dir=output_dir, text=True - ) - with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen( - olfd, 'w' - ) as olf: # handle audio manifest - if use_audio: - mf = audio_file.open() - mf.seek(audio_file_start_pos) - oaf = os.fdopen(oafd, 'w') - tf.seek(text_start_pos) - lf.seek(label_start_pos) - for _ in range(lines_per_dataset_fragment): - text_line = tf.readline() - if not text_line: - break - otf.write(text_line) - olf.write(lf.readline()) - if use_audio: - oaf.write(mf.readline()) - if use_audio: - mf.close() - oaf.close() - dataset = BertPunctuationCapitalizationDataset( - tmp_text, - tmp_labels, - max_seq_length, - tokenizer, - tokens_in_batch=tokens_in_batch, - pad_label=pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - n_jobs=0, - use_cache=False, - add_masks_and_segment_ids_to_batch=False, - verbose=False, - tokenization_progress_queue=tokenization_progress_queue, - batch_mark_up_progress_queue=batch_mark_up_progress_queue, - batch_building_progress_queue=batch_building_progress_queue, - audio_file=tmp_audio, - sample_rate=sample_rate, - use_audio=use_audio, - use_bucketing=True, - preload_audios=use_audio, - ) - finally: - if tmp_text is not None and os.path.exists(tmp_text): - os.remove(tmp_text) - if tmp_labels is not None and os.path.exists(tmp_labels): - os.remove(tmp_labels) - if tmp_audio is not None and os.path.exists(tmp_audio): - os.remove(tmp_audio) - dataset.features_pkl.unlink() - tar_ctr = 0 - current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format(fragment_idx=fragment_idx, file_idx=tar_ctr) - current_num_batches = 0 - sink = wds.TarWriter(str(current_file_name)) - progress_made = 0 - for batch_i, batch in enumerate(dataset): - 
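-        # Each batch is pickled into the current tar under the key 'batch.pyd'. Once
-        # `num_batches_per_tarfile` batches have been written, the tar is renamed to its
-        # "finished" name and a new in-progress tar is started. Leftover batches end up in a
-        # '.to_repack' tar that is consolidated later by repack_tar_files_with_not_enough_batches().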
sink.write({"__key__": f"fragment-{fragment_idx}-batch-{batch_i}", "batch.pyd": batch}) - current_num_batches += 1 - progress_made += len(batch['input_ids']) - if current_num_batches % num_batches_per_tarfile == 0: - sink.close() - current_file_name.rename( - output_dir - / TAR_FRAGMENT_TMPL_FINISHED.format( - fragment_idx=fragment_idx, num_batches=current_num_batches, file_idx=tar_ctr - ) - ) - writing_to_tar_progress_queue.put(progress_made) - progress_made = 0 - tar_ctr += 1 - current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format( - fragment_idx=fragment_idx, file_idx=tar_ctr - ) - current_num_batches = 0 - sink = wds.TarWriter(str(current_file_name)) - sink.close() - writing_to_tar_progress_queue.put(progress_made) - if progress_made > 0: - new_file_name = output_dir / TAR_FRAGMENT_TMPL_TO_REPACK.format( - fragment_idx=fragment_idx, num_batches=current_num_batches, file_idx=tar_ctr - ) - current_file_name.rename(new_file_name) - else: - current_file_name.unlink() - if fragment_idx == 0: - punct_label_ids_file, capit_label_ids_file = dataset.save_labels_and_get_file_paths( - DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME, DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - ) - punct_label_ids_file.rename(output_dir / DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME) - capit_label_ids_file.rename(output_dir / DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME) - shutil.rmtree(punct_label_ids_file.parent) - - -def remove_unexpected_files_and_dirs(output_dir: Path, output_file_tmpl: str, metadata_file_name: Path) -> None: - """ - This function removes all files with names which may be used in the dataset creation. - - Args: - output_dir: a path to directory where removal is performed - output_file_tmpl: a format string for a name of final tar file. Must include fields ``ctr`` for number of the - file and ``num_batches`` for number of batches in the file. - metadata_file_name: a metadata file name - """ - if not output_dir.is_dir(): - return - tar_final_pattern = re.compile(output_file_tmpl.format(ctr=NUMBER_RE, num_batches=NUMBER_RE)) - unexpected_tar_files = [ - path - for path in output_dir.iterdir() - if any( - [ - p.match(path.name) is not None - for p in [ - TAR_FRAGMENT_PATTERN_IN_PROGRESS, - TAR_FRAGMENT_PATTERN_FINISHED, - TAR_FRAGMENT_PATTERN_TO_REPACK, - tar_final_pattern, - ] - ] - ) - ] - if unexpected_tar_files: - logging.warning( - f"Found {len(unexpected_tar_files)} unexpected tar files in the output directory {output_dir}. " - f"All of them are going to be removed. The files match one of 3 patterns: " - f"'{TAR_FRAGMENT_PATTERN_IN_PROGRESS.pattern}', '{TAR_FRAGMENT_PATTERN_FINISHED.pattern}', " - f"'{tar_final_pattern.pattern}'. The first unexpected files: " - f"{', '.join([str(f) for f in unexpected_tar_files[:3]])}." - ) - for fn in unexpected_tar_files: - fn.unlink() - if metadata_file_name.exists(): - logging.warning(f"Found metadata file {metadata_file_name}. It is going to be removed.") - metadata_file_name.unlink() - punct_label_ids = output_dir / DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME - capit_label_ids = output_dir / DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - if punct_label_ids.exists(): - logging.warning(f"Found unexpected punctuation label file {punct_label_ids}. It is going to be removed.") - punct_label_ids.unlink() - if capit_label_ids.exists(): - logging.warning(f"Found unexpected capitalization label file {capit_label_ids}. 
It is going to be removed.") - capit_label_ids.unlink() - - -def collect_unique_labels_from_fragment( - labels_file: Path, start_pos: int, lines_per_dataset_fragment: int, progress_queue: mp.Queue, fragment_idx: int -) -> Tuple[Set[str], Set[str]]: - """ - Returns a set of unique punctuation labels and a set of unique capitalization labels. - - Args: - labels_file: a path to a file with labels - start_pos: an index of the first byte of a fragment in ``labels_file`` - lines_per_dataset_fragment: number of lines in dataset fragment. In the last fragment there can be less lines. - progress_queue: a queue for reporting number of processed lines - fragment_idx: a processed fragment index - - Returns: - unique_punct: a set of unique punctuation labels - unique_capit: a set of unique capitalization labels - """ - unique_punct, unique_capit = set(), set() - with labels_file.open() as f: - f.seek(start_pos) - progress_report = 0 - for i in range(lines_per_dataset_fragment): - line = f.readline() - if not line: - break - pairs = line.split() - if not all([len(p) == 2 for p in pairs]): - broken_pairs = [i for i, p in enumerate(pairs) if len(p) != 2] - raise ValueError( - f"Found broken labels line in number {fragment_idx * lines_per_dataset_fragment + i} in file " - f"{labels_file}. Indices of broken pairs of labels: {broken_pairs}" - ) - punct, capit = zip(*pairs) - unique_punct.update(punct) - unique_capit.update(capit) - progress_report += 1 - if progress_report >= PROGRESS_REPORT_PERIOD: - progress_queue.put(progress_report) - progress_report = 0 - progress_queue.put(progress_report) - return unique_punct, unique_capit - - -def create_label_dictionaries( - labels_file: Path, - text_start_bytes: List[int], - num_lines: int, - lines_per_dataset_fragment: int, - pad_label: str, - n_jobs: int, -) -> Tuple[Dict[str, int], Dict[str, int]]: - """ - Creates punctuation and capitalization label ids dictionaries based on labels present in ``labels_file``. - - Args: - labels_file: a path to file with labels - text_start_bytes: indices of first bytes of fragments in ``labels_file`` - num_lines: total number of lines in ``labels_file`` - lines_per_dataset_fragment: number of lines in dataset fragments. The last fragment can have fewer lines - pad_label: a label used for padding and for absence of punctuation and capitalization - n_jobs: a number of fragments processed in parallel - - Returns: - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - with Progress(num_lines, "Creating label dictionary", "line") as progress_queues: - result = Parallel(n_jobs=min(n_jobs, len(text_start_bytes)))( - delayed(collect_unique_labels_from_fragment)( - labels_file, start_pos, lines_per_dataset_fragment, *progress_queues, fragment_idx - ) - for fragment_idx, start_pos in enumerate(text_start_bytes) - ) - unique_punct, unique_capit = zip(*result) - unique_punct = set().union(*unique_punct) - unique_capit = set().union(*unique_capit) - return create_label_ids(unique_punct, pad_label), create_label_ids(unique_capit, pad_label) - - -def check_label_ids(pad_label: str, punct_label_ids: Dict[str, int], capit_label_ids: Dict[str, int]) -> None: - """ - A function for checking that pad label has zeroth id in ``punct_label_dis`` and ``capit_label_ids`` dictionaries. 
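-
-    For example (label sets are illustrative)::
-
-        check_label_ids('O', {'O': 0, ',': 1, '.': 2}, {'O': 0, 'U': 1})  # passes
-        check_label_ids('O', {',': 0, 'O': 1}, None)  # raises ValueError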
- Args: - pad_label: a pad label - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - msg = "Parameter `pad_label` has to have id 0 in dictionary `{param_name}` whereas it has id {id_}." + ( - '' if len(pad_label) > 10 else f" pad_label='{pad_label}'" - ) - if punct_label_ids is not None: - if punct_label_ids[pad_label] != 0: - raise ValueError(msg.format(param_name='punct_label_ids', id_=punct_label_ids[pad_label])) - if capit_label_ids is not None: - if capit_label_ids[pad_label] != 0: - raise ValueError(msg.format(param_name='capit_label_ids', id_=capit_label_ids[pad_label])) - - -def process_error(msg: str, error_class_or_function: Union[Type[Exception], Callable[[str], Any]]) -> None: - if inspect.isclass(error_class_or_function) and issubclass(error_class_or_function, Exception): - raise error_class_or_function(msg) - if callable(error_class_or_function): - error_class_or_function(msg) - raise ValueError( - f"Parameter `error_class_or_function` has to be a subclass of `Exception` or a function." - f"Given {type(error_class_or_function)}" - ) - - -def check_labels_for_being_unique_before_building_label_ids( - pad_label: str, - other_labels: List[str], - pad_label_name: str, - other_labels_name: str, - error_class_or_function: Union[Type[Exception], Callable[[str], Any]], -) -> None: - """ - A function for checking that that all labels are unique. - - Args: - pad_label: a pad label - other_labels: a list of labels except for the pad label - pad_label_name: a name of the pad label used in error message - other_labels_name: a name of other labels used in error message - error_class_or_function: a class of an exception which is raised if there is a problem with labels. - Alternatively it can be a function for handling exceptions, for example ``argparse.ArgumentParser.error``. - Such a function has to take one argument -- error message. - """ - for i, lbl in enumerate(other_labels): - if lbl == pad_label: - msg = f"Label number {i} in parameter `{other_labels_name}` is equal to `{pad_label_name}`." - process_error(msg, error_class_or_function) - for i in range(len(other_labels) - 1): - for lbl in other_labels[i + 1 :]: - if lbl == other_labels[i]: - msg = f"Label number {i} occurs at least 2 times in parameter `{other_labels_name}`." - process_error(msg, error_class_or_function) - - -def build_label_ids_from_list_of_labels(pad_label: str, other_labels: List[str]) -> Dict[str, int]: - """ - Builds label ids dictionary from pad label and list of other labels. Used for parsing command line arguments. - Args: - pad_label: a pad label - other_labels: list of labels except for the pad label - - Returns: - a dictionary with label ids - """ - check_labels_for_being_unique_before_building_label_ids( - pad_label, other_labels, 'pad_label', 'other_labels', ValueError - ) - ids = {pad_label: 0} - for lbl in other_labels: - ids[lbl] = len(ids) - return ids - - -def get_label_dictionaries( - labels_file: Path, - start_bytes: List[int], - num_lines: int, - lines_per_dataset_fragment: int, - pad_label: str, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - punct_label_vocab_file: Optional[Path], - capit_label_vocab_file: Optional[Path], - n_jobs: int, -) -> Tuple[Dict[str, int], Dict[str, int]]: - """ - Return label ids if the label ids are present in parameters ``punct_label_ids``, ``capit_label_ids``, - ``punct_label_vocab_file``, ``capit_label_vocab_file``. 
Otherwise, label ids are created using ``labels_file``. - - Args: - labels_file: a path to file with labels. Labels have to be given in the format described in - https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format - start_bytes: a list of positions in ``labels_file`` at which fragments start. Parameter ``start_bytes`` is used - for creating labels for several fragments in parallel - num_lines: total number of lines in ``labels_file``. Parameter ``num_lines`` is used for showing progress of - label ids collection - lines_per_dataset_fragment: number of lines in a dataset fragment - pad_label: a label used for padding and also neutral label showing there is no punctuation and capitalization. - Label ``pad_label`` has to have id ``0`` in parameters ``punct_label_ids``, ``capit_label_ids``, - ``punct_label_vocab_file``, ``capit_label_vocab_file`` if these parameters are provided. - punct_label_ids: a dictionary with punctuation label ids. Pad label has to have id ``0``. No more than 1 of - parameters ``punct_label_ids`` and ``punct_label_vocab_file`` can be provided. - capit_label_ids: a dictionary with capitalization label ids. Pad label has to have id ``0``. No more than 1 of - parameters ``capit_label_ids`` and ``capit_label_vocab_file`` can be provided. - punct_label_vocab_file: a text file with punctuation labels. Every line in the file contains 1 label. Pad label - has to be in the first line. No more than 1 of parameters ``punct_label_ids`` and - ``punct_label_vocab_file`` can be provided. - capit_label_vocab_file: a text file with capitalization labels. Every line in the file contains 1 label. Pad - label has to be in the first line. No more than 1 of parameters ``capit_label_ids`` and - ``capit_label_vocab_file`` can be provided. - n_jobs: a number of fragments processed in parallel - - Returns: - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - if punct_label_ids is not None and punct_label_vocab_file is not None: - raise ValueError("You can provide at most one of parameters `punct_label_ids` and `punct_label_vocab_file`.") - if capit_label_ids is not None and capit_label_vocab_file is not None: - raise ValueError("You can provide at most one of parameters `capit_label_ids` and `capit_label_vocab_file`.") - if punct_label_ids is None and punct_label_vocab_file is not None: - punct_label_ids = load_label_ids(punct_label_vocab_file) - if capit_label_ids is None and capit_label_vocab_file is not None: - capit_label_ids = load_label_ids(capit_label_vocab_file) - check_label_ids(pad_label, punct_label_ids, capit_label_ids) - if punct_label_ids is None or capit_label_ids is None: - _punct_label_ids, _capit_label_ids = create_label_dictionaries( - labels_file, start_bytes, num_lines, lines_per_dataset_fragment, pad_label, n_jobs - ) - if punct_label_ids is None: - punct_label_ids = _punct_label_ids - if capit_label_ids is None: - capit_label_ids = _capit_label_ids - return punct_label_ids, capit_label_ids - - -def decode_pyd(key: str, value: bytes) -> Any: - """ - Used for decoding batch loaded by ``webdataset`` from tar files. 
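-
-    In this module the function is registered in webdataset pipelines via::
-
-        wds.decode(wds.handle_extension('.pyd', decode_pyd))
-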
- Args: - key: name of a batch - value: pickled batch - - Returns: - decoded batch - """ - return pickle.loads(value) - - -def repack_tar_files_with_not_enough_batches(output_dir: Path, num_batches_per_tarfile: int) -> None: - f""" - It is possible that number of batches in a fragment is not evenly divisible by ``num_batches_per_tarfile``. - In such a case excess batches are put in a tar file which matches a pattern - ``fragment(0|[1-9][0-9]*).num_batches(0|[1-9][0-9]*).(0|[1-9][0-9]*).tar.to_repack``. Such files are repacked by - ``repack_tar_files_with_not_enough_batches`` function into tar files with correct ``num_batches_per_tarfile`` - batches each. If there is no enough batches in repacked files, then up to ``num_batches_per_tarfile - 1`` - remaining batches may be discarded. - - Args: - output_dir: a path to the output directory which contains files to repack and where new files are saved - num_batches_per_tarfile: a number of batches in 1 tar file. If number of batches in files matching a pattern - ``fragment(0|[1-9][0-9]*).num_batches(0|[1-9][0-9]*).(0|[1-9][0-9]*).tar.to_repack`` is not evenly - divisible by ``num_batches_per_tarfile`` excess batches are discarded. - """ - files_to_repack_with_matches = [ - (path, TAR_FRAGMENT_PATTERN_TO_REPACK.match(path.name)) - for path in output_dir.iterdir() - if TAR_FRAGMENT_PATTERN_TO_REPACK.match(path.name) is not None - ] - files_to_repack_with_matches = sorted(files_to_repack_with_matches, key=lambda x: int(x[1].group(3))) - logging.info(f"Found {len(files_to_repack_with_matches)} files for repacking.") - files_to_repack_with_matches = deque(files_to_repack_with_matches) - total_batches_in_repacked_files = 0 - initial_number_of_files_to_repack = len(files_to_repack_with_matches) - pop_file_ds = None - new_file_sink = None - new_file_num_batches = 0 - while files_to_repack_with_matches: - assert pop_file_ds is None or new_file_sink is None - if new_file_sink is None: - # `append_file` is a file which content will serve as a start for new tar file. `append_file` content is - # copied into a `new_file` and then content of other files needing repacking is appended to content of - # `new_file`. - append_file, match = files_to_repack_with_matches.popleft() - new_file = append_file.parent / TAR_FRAGMENT_TMPL_FINISHED.format( - fragment_idx=match.group(1), num_batches=num_batches_per_tarfile, file_idx=match.group(3) - ) - new_file_sink = wds.TarWriter(str(new_file)) - append_ds_to_rewrite = wds.DataPipeline( - wds.SimpleShardList(urls=[str(append_file)]), - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.to_tuple('__key__', 'batch.pyd'), - ) - for key, batch in iter(append_ds_to_rewrite): - new_file_sink.write({"__key__": key, "batch.pyd": batch}) - new_file_num_batches += 1 - total_batches_in_repacked_files += 1 - assert total_batches_in_repacked_files < initial_number_of_files_to_repack * num_batches_per_tarfile - assert new_file_num_batches == int(match.group(2)), ( - f"Number of batches {new_file_num_batches} in {append_file} is different from number of batches " - f"{match.group(2)} in repacked tar file with name {append_file}." 
- ) - append_file.unlink() - if files_to_repack_with_matches and pop_file_ds is None: - pop_file, _ = files_to_repack_with_matches.pop() - pop_file_ds = wds.DataPipeline( - wds.SimpleShardList([str(pop_file)]), - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.to_tuple('__key__', 'batch.pyd'), - ) - pop_file_ds = iter(pop_file_ds) - if pop_file_ds is not None and new_file_sink is not None: - while new_file_num_batches < num_batches_per_tarfile: - try: - key, batch = next(pop_file_ds) - except StopIteration: - pop_file_ds = None - pop_file.unlink() - break - new_file_sink.write({"__key__": key, "batch.pyd": batch}) - total_batches_in_repacked_files += 1 - assert total_batches_in_repacked_files < initial_number_of_files_to_repack * num_batches_per_tarfile - new_file_num_batches += 1 - if new_file_num_batches >= num_batches_per_tarfile: - assert new_file_num_batches == num_batches_per_tarfile - new_file_sink.close() - new_file_sink = None - new_file_num_batches = 0 - if new_file_sink is not None: - new_file_sink.close() - new_file.unlink() - logging.info(f"Discarded {new_file_num_batches} batches.") - if pop_file_ds is not None: - pop_file.unlink() - logging.info(f"Repacked {total_batches_in_repacked_files} batches from short tar files") - - -def create_metadata_file( - output_dir: Path, output_file_tmpl: str, metadata_file_name: Path, num_batches_per_tarfile: int -) -> None: - """ - Rename tar files according to template ``output_file_tmpl`` and save metadata file. - Args: - output_dir: a path to directory which contains initial tar files and where renamed tar files are saved - output_file_tmpl: a template of a new tar file name - metadata_file_name: a path to a file into which metadata is going to be saved - num_batches_per_tarfile: a required number of batches in tar files. Used for checking that present tar files - have correct number of batches - """ - metadata = {"num_batches": 0, "tar_files": []} - for i, fn in enumerate([fn for fn in output_dir.iterdir() if TAR_FRAGMENT_PATTERN_FINISHED.match(fn.name)]): - nb = int(TAR_FRAGMENT_PATTERN_FINISHED.match(fn.name).group(2)) - assert nb == num_batches_per_tarfile - new_name = output_dir / output_file_tmpl.format(ctr=i, num_batches=nb) - fn.rename(new_name) - metadata['tar_files'].append(new_name.name) - metadata["num_batches"] += nb - metadata[METADATA_PUNCT_LABEL_VOCAB_KEY] = DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME - metadata[METADATA_CAPIT_LABEL_VOCAB_KEY] = DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - logging.info(f"{metadata['num_batches']} batches are in tarred dataset with metadata file {metadata_file_name}") - with metadata_file_name.open('w') as f: - json.dump(metadata, f, indent=2) - - -def check_tar_file_prefix( - tar_file_prefix: str, error_class_or_function: Union[Type[Exception], Callable[[str], Any]], var_name: str -) -> None: - not_allowed_characters_in_prefix = NOT_ALLOWED_CHARACTERS_IN_FILE_NAME.findall(tar_file_prefix) - if not_allowed_characters_in_prefix: - not_allowed_characters_in_prefix = set(not_allowed_characters_in_prefix) - msg = ( - f"Found {len(not_allowed_characters_in_prefix)} not allowed characters in `{var_name}`. Only 'A-Z', " - f"'a-z', '0-9', '_', '-', '.' characters are allowed. Examples of not allowed characters: " - f"{list(not_allowed_characters_in_prefix)[:10]}. `{var_name}`[:30]={repr(tar_file_prefix)[:30]}." 
- ) - process_error(msg, error_class_or_function) - - -def create_tarred_dataset( - text_file: Union[os.PathLike, str], - labels_file: Union[os.PathLike, str], - output_dir: Union[os.PathLike, str], - max_seq_length: int, - tokens_in_batch: int, - lines_per_dataset_fragment: int, - num_batches_per_tarfile: int, - tokenizer_name: str, - tokenizer_model: Optional[Union[os.PathLike, str]] = None, - vocab_file: Optional[Union[os.PathLike, str]] = None, - merges_file: Optional[Union[os.PathLike, str]] = None, - special_tokens: Optional[Dict[str, str]] = None, - use_fast_tokenizer: Optional[bool] = False, - pad_label: str = 'O', - punct_label_ids: Optional[Dict[str, int]] = None, - capit_label_ids: Optional[Dict[str, int]] = None, - punct_label_vocab_file: Optional[Union[os.PathLike, str]] = None, - capit_label_vocab_file: Optional[Union[os.PathLike, str]] = None, - tar_file_prefix: Optional[str] = 'punctuation_capitalization', - n_jobs: Optional[int] = None, - audio_file: Optional[Path] = None, - use_audio: Optional[bool] = False, - sample_rate: Optional[int] = 16000, -) -> None: - """ - Creates tarred dataset from ``text_file`` and ``labels_file``. A tarred dataset allows to train on large amounts of - data without storing it all into memory simultaneously. You may use these function directly or try script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_. - - Tarred dataset is a directory which contains metadata file, tar files with batches, - ``punct_label_vocab.csv`` and ``capit_label_vocab.csv`` files. - - Metadata file is a JSON file with 4 items: ``'num_batches'``, ``'tar_files'``, ``'punct_label_vocab_file'``, - ``'capit_label_vocab_file'``. The item ``'num_batches'`` (``int``) is a total number of batches in tarred dataset. - ``'tar_files'`` is a list of paths to tar files relative to directory containing the metadata file. The items - ``'punct_label_vocab_file'`` and ``'capit_label_vocab_file'`` are correspondingly paths to punctuation and - capitalization label vocabulary files. These paths are relative to directory containing the metadata file. - - Every tar file contains objects written using ``webdataset.TarWriter``. Each object is a dictionary with two items: - ``'__key__'`` and ``'batch.pyd'``. ``'__key__'`` is a name of a batch and ``'batch.pyd'`` is a pickled dictionary - which contains ``'input_ids'``, ``'subtokens_mask'``, ``'punct_labels'``, ``'capit_labels'``. ``'input_ids'`` is an - array containing ids of source tokens, ``'subtokens_mask'`` is a boolean array showing first tokens in words, - ``'punct_labels'`` and ``'capit_labels'`` are arrays with ids of labels. - - Metadata file should be passed to constructor of :class:`BertPunctuationCapitalizationTarredDataset` and the - instance of the class will handle iteration and constructing masks and token types for BERT model. - - Args: - text_file (:obj:`Union[os.PathLike, str]`): a path to a file with dataset source. Dataset source is lowercase - text without punctuation. Number of lines in ``text_file`` has to be equal to the number of lines in - ``labels_file``. - labels_file (:obj:`Union[os.PathLike, str]`): a path to a file with labels. Labels are given in the format - described in :ref:`NeMo Data Format`. - output_dir (:obj:`Union[os.PathLike, str]`): a path to a directory where metadata file, tar files and - ``'punct_label_ids.csv'`` and ``'capit_label_ids.csv'`` files are saved. - max_seq_length (:obj:`int`): Maximum number of subtokens in an input sequence. 
A source sequence which contains - too many subtokens is clipped to ``max_seq_length - 2`` subtokens and then [CLS] token is prepended to the - clipped sequence and [SEP] token is appended to the clipped sequence. The clipping is performed via removal - of subtokens in the end of a source sequence. - tokens_in_batch (:obj:`int`): maximum number of tokens in a batch including [CLS], [SEP], [UNK], and [PAD] - tokens. Before packing into batches source sequences are sorted by number of tokens in order to reduce - number of pad tokens. So the number of samples in a batch may vary. - lines_per_dataset_fragment (:obj:`int`): a number of lines processed by one worker during creation of tarred - dataset. A worker tokenizes ``lines_per_dataset_fragment`` lines and keeps in RAM tokenized text labels - before packing them into batches. Reducing ``lines_per_dataset_fragment`` leads to reducing of the amount - of memory used by this function. - num_batches_per_tarfile (:obj:`int`): a number of batches saved in a tar file. If you increase - ``num_batches_per_tarfile``, then there will be less tar files in the dataset. There cannot be less than - ``num_batches_per_tarfile`` batches in a tar file, and all excess batches are removed. Maximum number of - discarded batches is ``num_batches_per_tarfile - 1``. - tokenizer_name (:obj:`str`): a name of the tokenizer used for tokenization of source sequences. Possible - options are ``'sentencepiece'``, ``'word'``, ``'char'``, HuggingFace tokenizers. For more options see - function ``nemo.collections.nlp.modules.common.get_tokenizer``. The tokenizer must have properties - ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id``. - tokenizer_model (:obj:`Union[os.PathLike, str]`, `optional`): a path to a tokenizer model required for - ``'sentencepiece'`` tokenizer. - vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to a vocabulary file which can be used in - ``'word'``, ``'char'``, and HuggingFace tokenizers. - merges_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to merges file which can be used in - HuggingFace tokenizers. - special_tokens (:obj:`Dict[str, str]`, `optional`): a dictionary with special tokens passed to constructors of - ``'char'``, ``'word'``, ``'sentencepiece'``, and various HuggingFace tokenizers. - use_fast_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to use fast HuggingFace - tokenizer. - pad_label (:obj:`str`, `optional`, defaults to :obj:`'O'`): a pad label both for punctuation and - capitalization. This label is also a neutral label (used for marking words which do not need punctuation - and capitalization). - punct_label_ids (:obj:`Dict[str, int]`, `optional`): a dictionary which keys are punctuation labels and values - are label ids. The pad label ``pad_label`` has to have id ``0``. You can provide at most one of parameters - ``punct_label_ids`` and ``punct_label_vocab_file``. If none of parameters ``punct_label_ids`` and - ``punct_label_vocab_file`` is provided, then punctuation label ids will be inferred from ``labels_file`` - file. - capit_label_ids (:obj:`Dict[str, int]`, `optional`): same as ``punct_label_ids`` for capitalization labels. - punct_label_vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to a file with punctuation labels. - These labels include pad label. The pad label has to be the first label in the file. Each label is written - on a separate line. Alternatively you can use ``punct_labels_ids`` parameter. 
If none of parameters - ``punct_labels_ids`` and ``punct_label_vocab_file`` is provided, then punctuation label ids will be - inferred from ``labels_file`` file. - capit_label_vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): same as ``punct_label_vocab_file`` for - capitalization labels. - tar_file_prefix (:obj:`str`, `optional`, defaults :obj:`'punctuation_capitalization'`): a string from which tar - file names start. The string can contain only characters ``A-Z``, ``a-z``, ``0-9``, ``_``, ``-``, ``.``. - n_jobs (:obj:`int`, `optional`): a number of workers for creating tarred dataset. If ``None``, then ``n_jobs`` - is equal to number of CPUs. - audio_file (:obj:`Optional[Union[os.PathLike, str]]`, defaults to :obj:`None`): a path to a file with audio dataset file paths if dataset is lexical and audio. Must contain one path per line. - use_audio (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to ``True`` dataset becomes lexical and audio rather than only lexical. - sample_rate (:obj:`int`, `optional`, defaults to :obj:`16000`) Targeted sample rate of audios If ``use_audio`` set to ``True``. - """ - check_tar_file_prefix(tar_file_prefix, ValueError, 'tar_file_prefix') - if n_jobs is None: - n_jobs = mp.cpu_count() - text_file, labels_file = Path(text_file).expanduser(), Path(labels_file).expanduser() - output_dir = Path(output_dir).expanduser() - ds_params_str = DATASET_PARAMETERS_TMPL.format( - prefix=tar_file_prefix, - tokens_in_batch=tokens_in_batch, - max_seq_length=max_seq_length, - tokenizer=REPLACE_NOT_ALLOWED_CHARACTERS_IN_FILE_NAME.sub('-', tokenizer_name), - ) - output_file_tmpl = ds_params_str + TAR_FINAL_TMPL - metadata_file_name = output_dir / ('metadata.' + ds_params_str + '.json') - remove_unexpected_files_and_dirs(output_dir, output_file_tmpl, metadata_file_name) - audio_start_bytes = None - if use_audio: - num_lines, text_start_bytes, label_start_bytes, audio_start_bytes = get_fragment_start_bytes( - text_file, labels_file, lines_per_dataset_fragment, audio_file - ) - else: - num_lines, text_start_bytes, label_start_bytes = get_fragment_start_bytes( - text_file, labels_file, lines_per_dataset_fragment - ) - if text_start_bytes: - output_dir.mkdir(parents=True, exist_ok=True) - else: - raise ValueError(f"Both {labels_file} and {text_file} are empty. 
Tarred dataset cannot be created.") - punct_label_ids, capit_label_ids = get_label_dictionaries( - labels_file, - label_start_bytes, - num_lines, - lines_per_dataset_fragment, - pad_label, - punct_label_ids, - capit_label_ids, - punct_label_vocab_file, - capit_label_vocab_file, - n_jobs, - ) - - with Progress( - num_lines, ["Tokenization", "Batch mark up", "Batch building", "Writing tarred dataset"], "query" - ) as progress_queues: - Parallel(n_jobs=min(n_jobs, len(text_start_bytes)))( - delayed(process_fragment)( - text_file, - labels_file, - output_dir, - text_start_pos, - label_start_pos, - lines_per_dataset_fragment, - max_seq_length, - tokens_in_batch, - num_batches_per_tarfile, - tokenizer_name, - None if tokenizer_model is None else Path(tokenizer_model).expanduser(), - None if vocab_file is None else Path(vocab_file).expanduser(), - None if merges_file is None else Path(merges_file).expanduser(), - special_tokens, - use_fast_tokenizer, - pad_label, - punct_label_ids, - capit_label_ids, - fragment_idx, - *progress_queues, - audio_file, - sample_rate, - audio_file_start_pos, - use_audio, - ) - for fragment_idx, (text_start_pos, label_start_pos, audio_file_start_pos) in enumerate( - zip( - text_start_bytes, - label_start_bytes, - audio_start_bytes if use_audio else [None for _ in range(len(text_start_bytes))], - ) - ) - ) - repack_tar_files_with_not_enough_batches(output_dir, num_batches_per_tarfile) - create_metadata_file(output_dir, output_file_tmpl, metadata_file_name, num_batches_per_tarfile) - - -class BertPunctuationCapitalizationTarredDataset(IterableDataset): - """ - Punctuation capitalization dataset which allows not to load all data in memory simultaneously. A tarred dataset - is created from text and label files using script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_ - or function - :func:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.create_tarred_dataset`. - - Args: - metadata_file (:obj:`Union[os.PathLike, str]`): a path to tarred dataset metadata file. Metadata file and files - referenced in metadata file are created by - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_. - Metadata file is a JSON file which contains ``'num_batches'``, ``'tar_files'``, - ``'punct_label_vocab_file'``, ``'capit_label_vocab_file'`` items. The first item is total number of batches - in a dataset, the second is a list of paths to tar files relative to directory containing - ``metadata_file``. Items ``'punct_label_vocab_file'`` and ``'capit_label_vocab_file'`` are paths to - ``.csv`` files which contain unique punctuation a capitalization label vocabularies. Vocabulary file paths - are relative to directory containing the ``metadata_file``. Each line in ``'punct_label_vocab_file'`` and - ``'capit_label_vocab_file'`` contains 1 label. The first lines in ``'punct_label_vocab_file'`` and - ``'capit_label_vocab_file'`` files are neutral labels which also serve as pad labels. Neutral labels for - punctuation and capitalization must be equal to the ``pad_label`` parameter. - tokenizer (:obj:`TokenizerSpec`): a tokenizer instance used for tokenization of dataset source. A tokenizer - instance is used for getting ids of [CLS], [PAD], and [SEP] tokens which are used for masks creation. - pad_label (:obj:`str`): a label that is used for padding and for absence of punctuation or - capitalization. 
Used for checking items ``'punct_label_vocab'`` and ``'capit_label_vocab'`` of dictionary - in ``metadata_file``. - label_info_save_dir (:obj:`Union[os.PathLike, str]`, `optional`): a path to a directory where label - vocabularies are copied when method :meth:`save_labels_and_get_file_paths` is called. This parameter is - useful if tarred dataset directory is read-only. - ignore_extra_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to use only first token in a - word for loss computation and training. If set to ``True``, then loss will be computed only for the first - tokens of words. - ignore_start_end (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to compute loss for [CLS] and - [SEP] tokens. If set to ``True``, then loss will not be computed for [CLS] and [SEP] tokens. - world_size (:obj:`int`, `optional`, defaults to :obj:`1`): a number of processes used for model training. It is - used together with a ``global_rank`` parameter to decide which tar files will be used in the current - process. - global_rank (:obj:`int`, `optional`, defaults to :obj:`0`): a number of current process in the pool of workers - used for model training. It is used together with ``world_size`` parameter to decide which tar files will - be used in the current process. - shuffle_n (:obj:`int`, `optional`, defaults to :obj:`1`): a number of shuffled batches in a buffer. - ``shuffle_n`` batches are loaded into memory, shuffled, and then yielded by a dataset instance. - shard_strategy (:obj:`str`, defaults to :obj:``'scatter'``): Tarred dataset shard distribution strategy chosen as - a str value during ddp. - - ``'scatter'``: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - ``'replicate'``: Optional shard strategy, where each node gets all the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of :param:`shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tar files, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
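-        When the dataset is constructed with ``use_audio=True``, audio ``'features'`` and
-        ``'features_length'`` ports are returned in addition to the lexical ones.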
""" - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - metadata_file: Union[os.PathLike, str], - tokenizer: TokenizerSpec, - pad_label: str, - label_info_save_dir: Optional[Union[os.PathLike, str]] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = True, - world_size: int = 1, - global_rank: int = 0, - shuffle_n: int = 1, - shard_strategy: str = "scatter", - use_audio: bool = False, - ) -> None: - super().__init__() - - valid_shard_strategies = ['scatter', 'replicate'] - if shard_strategy not in valid_shard_strategies: - raise ValueError( - f"Invalid shard strategy of type {type(shard_strategy)} " - f"{repr(shard_strategy) if len(repr(shard_strategy)) < 100 else repr(shard_strategy)[:100] + '...'}! " - f"Allowed values are: {valid_shard_strategies}." - ) - - self.tokenizer = tokenizer - self.metadata_file = Path(metadata_file).expanduser() - if label_info_save_dir is None: - self.for_nemo_ckpt = self.metadata_file.parent / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - else: - self.for_nemo_ckpt = Path(label_info_save_dir).expanduser() / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - with open(self.metadata_file) as f: - self.metadata = json.load(f) - self.ignore_extra_tokens = ignore_extra_tokens - self.ignore_start_end = ignore_start_end - self.tar_files = [] - for file_path in self.metadata['tar_files']: - file_path = Path(file_path).expanduser() - if file_path.is_absolute(): - self.tar_files.append(str(file_path)) - else: - self.tar_files.append(str(self.metadata_file.parent / file_path)) - self.punct_label_vocab_file = self.metadata_file.parent / self.metadata[METADATA_PUNCT_LABEL_VOCAB_KEY] - self.capit_label_vocab_file = self.metadata_file.parent / self.metadata[METADATA_CAPIT_LABEL_VOCAB_KEY] - self.punct_label_ids = load_label_ids(self.punct_label_vocab_file) - self.capit_label_ids = load_label_ids(self.capit_label_vocab_file) - self.pad_label = pad_label - self._check_pad_label() - - if shard_strategy == 'scatter': - logging.info("Tarred dataset shards will be scattered evenly across all nodes.") - if len(self.tar_files) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(self.tar_files)}) is not divisible " - f"by number of distributed workers ({world_size}). " - f"Some shards will not be used ({len(self.tar_files) % world_size})." 
- ) - begin_idx = (len(self.tar_files) // world_size) * global_rank - end_idx = begin_idx + (len(self.tar_files) // world_size) - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - batches_per_tar = self.metadata['num_batches'] // len(self.tar_files) - self.tar_files = self.tar_files[begin_idx:end_idx] - self.length = batches_per_tar * len(self.tar_files) * world_size - - elif shard_strategy == 'replicate': - logging.info("All tarred dataset shards will be replicated across all nodes.") - self.length = self.metadata['num_batches'] - - else: - raise ValueError(f"Invalid shard strategy! Allowed values are: {valid_shard_strategies}") - - self._dataset = wds.DataPipeline( - wds.SimpleShardList(self.tar_files), - webdataset_split_by_workers, - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.shuffle(shuffle_n), - wds.to_tuple('__key__', 'batch.pyd'), - wds.map(self._build_sample), - ) - - self.use_audio = use_audio - - def _check_pad_label(self) -> None: - """ - Checks the condition that ``pad_label`` passed to this class constructor has ``0`` id in - ``self.punct_label_ids`` and ``self.capit_label_ids`` loaded from tarred dataset. - """ - for label_ids, labels_file, task in [ - (self.punct_label_ids, self.metadata[METADATA_PUNCT_LABEL_VOCAB_KEY], "punctuation"), - (self.capit_label_ids, self.metadata[METADATA_CAPIT_LABEL_VOCAB_KEY], "capitalization"), - ]: - if label_ids[self.pad_label] != 0: - raise ValueError( - f"Pad label '{self.pad_label}' has non zero id {label_ids[self.pad_label]} in {task} " - f"ids dictionary loaded from {labels_file}." - ) - - def check_for_label_consistency_with_model_config( - self, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - class_labels: DictConfig, - common_dataset_parameters_config: DictConfig, - ) -> None: - """ - Checks that label ids loaded from tarred dataset are identical to those provided in - ``model.common_dataset_parameters`` :ref:`config` item. In addition, - this method checks that label ids set in attributes ``punct_label_ids`` and ``capit_label_ids`` of an instance - of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - are identical to label ids loaded from tarred dataset. - - Args: - punct_label_ids: a content of ``punct_label_ids`` attribute of an instance of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - in which this tarred dataset is used. - capit_label_ids: a content of ``capit_label_ids`` attribute of an instance of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - in which this tarred dataset is used. - class_labels: a config item ``model.class_labels``. See more in description of - :ref:`class labels' config`. - common_dataset_parameters_config: a config item ``model.common_dataset_parameters``. See more in - of :ref:`common dataset parameters config`. 
- """ - tarred_dataset_label_desc_tmpl = ( - f'{{label_type}} labels loaded from tarred dataset with metadata file {self.metadata_file}' - ) - if punct_label_ids is not None: - if punct_label_ids != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=punct_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc="Punctuation labels stored in an attribute " - "`PunctuationCapitalizationModel.punct_label_ids`", - ) - if capit_label_ids is not None: - if capit_label_ids != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=capit_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc="Capitalization labels stored in an attribute" - "`PunctuationCapitalizationModel.capit_label_ids`", - ) - if common_dataset_parameters_config.punct_label_ids is not None: - cfg_punct_label_ids = dict(common_dataset_parameters_config.punct_label_ids) - if cfg_punct_label_ids != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=cfg_punct_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc='Punctuation labels stored a config field ' - '`model.common_dataset_parameters.punct_label_ids`', - ) - if common_dataset_parameters_config.capit_label_ids is not None: - cfg_capit_label_ids = dict(common_dataset_parameters_config.capit_label_ids) - if cfg_capit_label_ids != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=cfg_capit_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc='Capitalization labels stored a config field ' - '`model.common_dataset_parameters.capit_label_ids`', - ) - if common_dataset_parameters_config.label_vocab_dir is not None: - label_vocab_dir = Path(common_dataset_parameters_config.label_vocab_dir).expanduser() - punct_label_vocab_file = label_vocab_dir / class_labels.punct_labels_file - file_punct_vocab = load_label_ids(punct_label_vocab_file) - if file_punct_vocab != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=file_punct_vocab, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc=f'labels stored in file {punct_label_vocab_file} passed in ' - f'`model.common_dataset_parameters.punct_label_vocab_file`', - ) - capit_label_vocab_file = label_vocab_dir / class_labels.capit_labels_file - file_capit_vocab = load_label_ids(capit_label_vocab_file) - if file_capit_vocab != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=file_capit_vocab, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc=f'labels stored in file {capit_label_vocab_file} passed in ' - f'`model.common_dataset_parameters.capit_label_vocab_file`', - ) - - def save_labels_and_get_file_paths( - self, punct_labels_file_name: str, capit_labels_file_name: str - ) -> Tuple[Path, Path]: - """ - Copies label vocabulary files for punctuation and capitalization into directory passed in the constructor - parameter ``label_info_save_dir``. The names of new - files are ``punct_labels_file_name`` and ``capit_labels_file_name``. 
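-
-        For example (the ``dataset`` name and file names are illustrative)::
-
-            punct_file, capit_file = dataset.save_labels_and_get_file_paths(
-                'punct_label_vocab.csv', 'capit_label_vocab.csv'
-            )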
- - The signatures of this method and the signature of the method - :meth:`~nemo.collections.nlp.data.token_classification.BertPunctuationCapitalizationDataset.save_labels_and_get_file_paths` - must be identical. - - Args: - punct_labels_file_name (:obj:`str`): a name of punctuation labels file - capit_labels_file_name (:obj:`str`): a name of capitalization labels file - - Returns: - :obj:`Tuple[Path, Path]`: a tuple of 2 elements - - - :obj:`pathlib.Path`: a path to the new punctuation label ids file - - :obj:`pathlib.Path`: a path to the new capitalization label ids file - """ - self.for_nemo_ckpt.mkdir(parents=True, exist_ok=True) - punct_label_ids_file = self.for_nemo_ckpt / punct_labels_file_name - capit_label_ids_file = self.for_nemo_ckpt / capit_labels_file_name - shutil.copy(str(self.punct_label_vocab_file), str(punct_label_ids_file)) - shutil.copy(str(self.capit_label_vocab_file), str(capit_label_ids_file)) - return punct_label_ids_file, capit_label_ids_file - - def _build_sample(self, batch: Tuple[str, Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]: - """ - Takes batch loaded from tarred dataset and transforms it for passing to the model. Adds ``'segment_ids'``, - ``'input_mask'``, ``'loss_mask'`` items to the batch. - - Args: - batch: a tuple of 2 elements: batch name and a dictionary with ``'input_ids'``, ``'subtokens_mask'``, - ``'punct_labels'``, ``'capit_labels'``. Batch name is not needed for training and inference and - discarded. - - Returns: - a batch in the form of a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'subtokens_mask'``: a boolean numpy array of shape ``[Batch, Time]``; - - ``'punct_labels'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'capit_labels'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'segment_ids'``: a ``np.int8`` numpy array of shape ``[Batch, Time]``; - - ``'input_mask'``: a boolean numpy array of shape ``[Batch, Time]``; - - ``'loss_mask'``: a boolean numpy array of shape ``[Batch, Time]``. - """ - _, batch = batch - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch['input_ids'], - batch['subtokens_mask'], - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = batch_segment_ids - batch['input_mask'] = batch_input_mask - batch['loss_mask'] = batch_loss_mask - return batch - - def __iter__(self) -> Iterator[Dict[str, np.ndarray]]: - """ - Constructs an iterator of batches. The values of one batch dictionary are numpy arrays of identical shapes - ``[Batch, Time]``. - - Returns: - :obj:`Iterator[Dict[str, np.ndarray]]`: an iterator of batches with items: - - - ``'input_ids'``: ``np.int32`` array containing encoded tokens, - - ``'subtokens_mask'``: ``bool`` array which elements are ``True`` if they correspond to first token in - a word, - - ``'punct_labels'``: ``np.int32`` array with encoded punctuation labels, - - ``'capit_labels'``: ``np.int32`` array with encoded capitalization labels, - - ``'segment_ids'``: ``np.int8`` array filled with zeros (BERT token types in HuggingFace terminology), - - ``'input_mask'``: ``bool`` array which elements are ``True`` if corresponding token is not a padding - token, - - ``'loss_mask'``: ``bool`` array which elements are ``True`` if loss is computed for corresponding - token. See more in description of constructor parameters ``ignore_start_end``, ``ignore_extra_tokens``. 
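-
-        For illustration, batches are usually consumed through a ``DataLoader`` with ``batch_size=1``
-        and :meth:`collate_fn` of this class (the ``dataset`` name is illustrative)::
-
-            loader = torch.utils.data.DataLoader(dataset, batch_size=1, collate_fn=dataset.collate_fn)
-            for batch in loader:
-                input_ids = batch['input_ids']  # torch.int32 tensor of shape [Batch, Time]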
- """ - return self._dataset.__iter__() - - def __len__(self) -> int: - return self.length - - def collate_fn(self, batches: List[Dict[str, np.ndarray]]) -> Dict[str, torch.Tensor]: - """ - Return zeroth batch of ``batches`` list passed for collating and casts ``'segment_ids'``, ``'punct_labels'``, - ``'capit_labels'`` to types supported by - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel`. - All output tensors have shape ``[Batch, Time]``. - - .. warning:: - ``batch size`` parameter of a PyTorch data loader and sampler has to be ``1``. - - Args: - batches (:obj:`List[Dict[str, np.ndarray]]`): a list of batches passed for collating - - Returns: - :obj:`Dict[str, torch.Tensor]`: a batch dictionary with following items (for detailed description of batch - items see method :meth:`__getitem__`): - - - ``'input_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'subtokens_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'punct_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'capit_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'segment_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'input_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'loss_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor. - """ - batch = {k: torch.as_tensor(v) for k, v in batches[0].items()} - batch['segment_ids'] = batch['segment_ids'].int() - batch['punct_labels'] = batch['punct_labels'].long() - batch['capit_labels'] = batch['capit_labels'].long() - if self.use_audio: - batch['features'] = batch['features'].to(torch.float32) - return batch diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py deleted file mode 100644 index 4f49e34ce24e..000000000000 --- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" - -import os -import pickle -import tempfile -import time -from typing import Dict, List, Optional - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] - - -def get_features( - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = -1, - label_ids: dict = None, - pad_label: str = 'O', - raw_labels: List[str] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, -): - """ - Processes the data and returns features. - Args: - queries: text sequences - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP], when -1 - use the max len from the data - pad_label: pad value use for labels. By default, it's the neutral label. - raw_labels: list of labels for every word in a sequence - label_ids: dict to map labels to label ids. - Starts with pad_label->0 and then increases in alphabetical order. - Required for training and evaluation, not needed for inference. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask - """ - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_labels = [] - with_label = False - - if raw_labels is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - - # add bos token - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - pad_id = label_ids[pad_label] - labels = [pad_id] - query_labels = [label_ids[lab] for lab in raw_labels[i]] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - labels.extend([query_labels[j]] * len(word_tokens)) - # add eos token - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - - if with_label: - labels.append(pad_id) - all_labels.append(labels) - - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting Max Seq length to: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if 
len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append(tokenizer.tokens_to_ids(subtokens)) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_labels[i] = all_labels[i] + [pad_id] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.warning(f'{too_long_count} are longer than {max_seq_length}') - - for i in range(min(len(all_input_ids), 1)): - logging.info("*** Example ***") - logging.info("i: %s", i) - logging.info("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) - logging.info("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) - logging.info("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) - logging.info("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.info("labels: %s", " ".join(list(map(str, all_labels[i])))) - return (all_input_ids, all_segment_ids, all_input_mask, all_subtokens_mask, all_loss_mask, all_labels) - - -class BertTokenClassificationDataset(Dataset): - """ - Creates dataset to use during training for token classification tasks with a pretrained model. - - Converts from raw data to an instance that can be used by Dataloader. - For dataset to use during inference without labels, see BertTokenClassificationInferDataset. - - Args: - text_file: file to sequences, each line should a sentence, no header. - label_file: file to labels, each line corresponds to word labels for a sentence in the text_file. No header. - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as AutoTokenizer - num_samples: number of samples you want to use for the dataset. - If -1, use all dataset. Useful for testing. - pad_label: pad value use for labels. By default, it's the neutral label. - label_ids: label_ids (dict): dict to map labels to label ids. - Starts with pad_label->0 and then increases in alphabetical order - For dev set use label_ids generated during training to support - cases when not all labels are present in the dev set. - For training set label_ids should be None. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask - use_cache: whether to use processed data cache or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
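To make the sub-token alignment performed by get_features above concrete, here is a small self-contained illustration; the word-piece splits are assumed rather than produced by a real tokenizer. The first sub-token of each word gets subtokens_mask = 1, every piece of a word repeats the word's label, and [CLS]/[SEP] take the pad label with mask 0.

# Toy walk-through of the word-to-subtoken expansion (not the NeMo code path).
words = ["Hello", "world"]
raw_labels = ["O", "B-LOC"]
word_pieces = {"Hello": ["Hel", "##lo"], "world": ["world"]}  # assumed tokenization
pad_label = "O"

subtokens, labels, subtokens_mask = ["[CLS]"], [pad_label], [0]
for word, label in zip(words, raw_labels):
    pieces = word_pieces[word]
    subtokens.extend(pieces)
    labels.extend([label] * len(pieces))
    subtokens_mask.extend([1] + [0] * (len(pieces) - 1))
subtokens.append("[SEP]")
labels.append(pad_label)
subtokens_mask.append(0)

print(subtokens)       # ['[CLS]', 'Hel', '##lo', 'world', '[SEP]']
print(labels)          # ['O', 'O', 'O', 'B-LOC', 'O']
print(subtokens_mask)  # [0, 1, 0, 1, 0]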
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - text_file: str, - label_file: str, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: str = 'O', - label_ids: Dict[str, int] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - use_cache: bool = True, - ): - """ Initializes BertTokenClassificationDataset. """ - - data_dir = os.path.dirname(text_file) - text_filename = os.path.basename(text_file) - lbl_filename = os.path.basename(label_file) - - if not text_filename.endswith('.txt'): - raise ValueError("{text_file} should have extension .txt") - - vocab_size = getattr(tokenizer, "vocab_size", 0) - features_pkl = os.path.join( - data_dir, - f"cached__{text_filename}__{lbl_filename}__{tokenizer.name}_{max_seq_length}_{vocab_size}_{num_samples}", - ) - - master_device = is_global_rank_zero() - features = None - if master_device and (not use_cache or not os.path.exists(features_pkl)): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(text_file, 'r') as f: - text_lines = f.readlines() - - labels_lines = [] - with open(label_file, 'r') as f: - for line in f: - line = line.strip().split() - labels_lines.append(line) - - if len(labels_lines) != len(text_lines): - raise ValueError("Labels file should contain labels for every word") - - if num_samples > 0: - dataset = list(zip(text_lines, labels_lines)) - dataset = dataset[:num_samples] - - dataset = list(zip(*dataset)) - text_lines = dataset[0] - labels_lines = dataset[1] - - features = get_features( - queries=text_lines, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - pad_label=pad_label, - raw_labels=labels_lines, - label_ids=label_ids, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - - # save features to a temp file first to make sure that non-master processes don't start reading the file - # until the master process is done with writing - ofd, tmp_features_pkl = tempfile.mkstemp( - suffix='.pkl', prefix=os.path.basename(features_pkl), dir=os.path.dirname(features_pkl) - ) - with os.fdopen(ofd, 'wb') as temp_f: - pickle.dump(features, temp_f) - - os.rename(tmp_features_pkl, features_pkl) - logging.info(f'features saved to {features_pkl}') - - # wait until the master process writes to the processed data files - if not master_device: - while features is None and not os.path.exists(features_pkl): - time.sleep(10) - - if features is None: - features = pickle.load(open(features_pkl, 'rb')) - logging.info(f'features restored from {features_pkl}') - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_subtokens_mask = features[3] - self.all_loss_mask = features[4] - self.all_labels = features[5] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_subtokens_mask[idx]), - np.array(self.all_loss_mask[idx]), - np.array(self.all_labels[idx]), - ) - - -class BertTokenClassificationInferDataset(Dataset): - """ - Creates 
dataset to use during inference for token classification tasks with a pretrained model. - For dataset to use during training with labels, see BertTokenClassificationDataset. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__( - self, queries: List[str], max_seq_length: int, tokenizer: TokenizerSpec, - ): - """ - Initializes BertTokenClassificationInferDataset - Args: - queries: text sequences - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as AutoTokenizer - """ - features = get_features(queries=queries, max_seq_length=max_seq_length, tokenizer=tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_subtokens_mask = features[3] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/token_classification/token_classification_utils.py b/nemo/collections/nlp/data/token_classification/token_classification_utils.py deleted file mode 100644 index 94acd69d3b11..000000000000 --- a/nemo/collections/nlp/data/token_classification/token_classification_utils.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle -import re -import string -from typing import Dict - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_label_stats, -) -from nemo.utils import logging - -__all__ = ['get_label_ids', 'create_text_and_labels'] - - -def remove_punctuation(word: str): - """ - Removes all punctuation marks from a word except for ' - that is often a part of word: don't, it's, and so on - """ - all_punct_marks = string.punctuation.replace("'", '') - return re.sub('[' + all_punct_marks + ']', '', word) - - -def create_text_and_labels(output_dir: str, file_path: str, punct_marks: str = ',.?'): - """ - Create datasets for training and evaluation. - - Args: - output_dir: path to the output data directory - file_path: path to file name - punct_marks: supported punctuation marks - - The data will be split into 2 files: text.txt and labels.txt. \ - Each line of the text.txt file contains text sequences, where words\ - are separated with spaces. The labels.txt file contains \ - corresponding labels for each word in text.txt, the labels are \ - separated with spaces. 
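The cache handling in BertTokenClassificationDataset above follows a common multi-process pattern: only the global-rank-zero process computes and pickles the features, writing them to a temporary file and renaming it so readers never see a partial pickle, while the other ranks poll until the finished cache appears. A minimal standalone sketch of that pattern (compute_features and the cache path are placeholders):

import os
import pickle
import tempfile
import time


def load_or_build_cache(cache_path: str, is_rank_zero: bool, compute_features):
    """Rank zero builds the cache; other ranks wait for it to appear, then load it."""
    features = None
    if is_rank_zero and not os.path.exists(cache_path):
        features = compute_features()
        # Write to a temp file first, then rename, so other ranks never read
        # a half-written pickle.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(cache_path) or ".")
        with os.fdopen(fd, "wb") as f:
            pickle.dump(features, f)
        os.rename(tmp_path, cache_path)

    if not is_rank_zero:
        while not os.path.exists(cache_path):
            time.sleep(1)  # poll until rank zero has finished writing

    if features is None:
        with open(cache_path, "rb") as f:
            features = pickle.load(f)
    return features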
Each line of the files should follow the \ - format: \ - [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and \ - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' - """ - if not os.path.exists(file_path): - raise ValueError(f'{file_path} not found') - - os.makedirs(output_dir, exist_ok=True) - - base_name = os.path.basename(file_path) - labels_file = os.path.join(output_dir, 'labels_' + base_name) - text_file = os.path.join(output_dir, 'text_' + base_name) - - with open(file_path, 'r') as f: - with open(text_file, 'w') as text_f: - with open(labels_file, 'w') as labels_f: - for line in f: - line = line.split() - text = '' - labels = '' - for word in line: - label = word[-1] if word[-1] in punct_marks else 'O' - word = remove_punctuation(word) - if len(word) > 0: - if word[0].isupper(): - label += 'U' - else: - label += 'O' - - word = word.lower() - text += word + ' ' - labels += label + ' ' - - text_f.write(text.strip() + '\n') - labels_f.write(labels.strip() + '\n') - - print(f'{text_file} and {labels_file} created from {file_path}.') - - -def get_label_ids( - label_file: str, - is_training: bool = False, - pad_label: str = 'O', - label_ids_dict: Dict[str, int] = None, - get_weights: bool = True, - class_labels_file_artifact='label_ids.csv', -): - """ - Generates str to int labels mapping for training data or checks correctness of the label_ids_dict - file for non-training files or if label_ids_dict is specified - - Args: - label_file: the path of the label file to process - is_training: indicates whether the label_file is used for training - pad_label: token used for padding - label_ids_dict: str label name to int ids mapping. Required for non-training data. - If specified, the check that all labels from label_file are present in label_ids_dict will be performed. - For training data, if label_ids_dict is None, a new mapping will be generated from label_file. - get_weights: set to True to calculate class weights, required for Weighted Loss. 
- class_labels_file_artifact: name of the file to save in .nemo - """ - if not os.path.exists(label_file): - raise ValueError(f'File {label_file} was not found.') - - logging.info(f'Processing {label_file}') - if not is_training and label_ids_dict is None: - raise ValueError( - f'For non training data, label_ids_dict created during preprocessing of the training data ' - f'should be provided' - ) - - # collect all labels from the label_file - data_dir = os.path.dirname(label_file) - unique_labels = set(pad_label) - all_labels = [] - with open(label_file, 'r') as f: - for line in f: - line = line.strip().split() - all_labels.extend(line) - unique_labels.update(line) - - # check that all labels from label_file are present in the specified label_ids_dict - # or generate label_ids_dict from data (for training only) - if label_ids_dict: - logging.info(f'Using provided labels mapping {label_ids_dict}') - for name in unique_labels: - if name not in label_ids_dict: - raise ValueError(f'{name} class from {label_file} not found in the provided mapping: {label_ids_dict}') - else: - label_ids_dict = {pad_label: 0} - if pad_label in unique_labels: - unique_labels.remove(pad_label) - for label in sorted(unique_labels): - label_ids_dict[label] = len(label_ids_dict) - - label_ids_filename = os.path.join(data_dir, class_labels_file_artifact) - if is_training: - with open(label_ids_filename, 'w') as f: - labels, _ = zip(*sorted(label_ids_dict.items(), key=lambda x: x[1])) - f.write('\n'.join(labels)) - logging.info(f'Labels mapping {label_ids_dict} saved to : {label_ids_filename}') - - # calculate label statistics - base_name = os.path.splitext(os.path.basename(label_file))[0] - stats_file = os.path.join(data_dir, f'{base_name}_label_stats.tsv') - if os.path.exists(stats_file) and not is_training and not get_weights: - logging.info(f'{stats_file} found, skipping stats calculation.') - else: - all_labels = [label_ids_dict[label] for label in all_labels] - logging.info(f'Three most popular labels in {label_file}:') - total_labels, label_frequencies, max_id = get_label_stats(all_labels, stats_file) - logging.info(f'Total labels: {total_labels}. Label frequencies - {label_frequencies}') - - if get_weights: - class_weights_pkl = os.path.join(data_dir, f'{base_name}_weights.p') - if os.path.exists(class_weights_pkl): - class_weights = pickle.load(open(class_weights_pkl, 'rb')) - logging.info(f'Class weights restored from {class_weights_pkl}') - else: - class_weights_dict = get_freq_weights(label_frequencies) - logging.info(f'Class Weights: {class_weights_dict}') - class_weights = fill_class_weights(class_weights_dict, max_id) - - pickle.dump(class_weights, open(class_weights_pkl, "wb")) - logging.info(f'Class weights saved to {class_weights_pkl}') - else: - class_weights = None - - return label_ids_dict, label_ids_filename, class_weights diff --git a/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py b/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py deleted file mode 100644 index 6d56d4564a5c..000000000000 --- a/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
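A tiny worked example may help with the two helpers above; the sentence, punctuation set, and pad label below are made up for illustration. The first part mirrors the [PUNCT][CASE] labels written by create_text_and_labels, the second mirrors the get_label_ids convention of assigning id 0 to the pad label and the remaining ids in alphabetical order.

import re
import string

# Part 1: the [PUNCT][CASE] labels produced by create_text_and_labels.
punct_marks = ',.?'


def strip_punct(word):
    # Same character class as remove_punctuation above: all punctuation except apostrophes.
    return re.sub('[' + string.punctuation.replace("'", '') + ']', '', word)


line = "Hello, how are you?"
text, labels = [], []
for word in line.split():
    label = word[-1] if word[-1] in punct_marks else 'O'
    word = strip_punct(word)
    if word:
        label += 'U' if word[0].isupper() else 'O'
        text.append(word.lower())
        labels.append(label)
print(' '.join(text), '|', ' '.join(labels))   # hello how are you | ,U OO OO ?O

# Part 2: the label -> id convention used by get_label_ids
# (pad label chosen here only for this toy example).
pad_label = 'OO'
unique_labels = set(labels)                    # {',U', 'OO', '?O'}
label_ids = {pad_label: 0}
for lab in sorted(unique_labels - {pad_label}):
    label_ids[lab] = len(label_ids)
print(label_ids)                               # {'OO': 0, ',U': 1, '?O': 2}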
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( - ZeroShotIntentInferenceDataset, - calc_class_weights_from_dataloader, -) diff --git a/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py b/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py deleted file mode 100644 index d14e0c7b73c3..000000000000 --- a/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle -from typing import Dict, List, Optional - -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - DataProcessor, - fill_class_weights, - get_freq_weights, - get_label_stats, -) -from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['ZeroShotIntentProcessor', 'ZeroShotIntentDataset', 'ZeroShotIntentInferenceDataset'] - - -class ZeroShotIntentProcessor(DataProcessor): - """ - Processor for entailment data sets used to train NLI models for zero shot intent classification. 
- """ - - def __init__(self, sent1_col: int, sent2_col: int, label_col: int, num_classes: int): - """ - Args: - sent1_col: the index of the column containing the premise (or sentence 1) - sent2_col: the index of the column containing the hypothesis (or sentence 2) - label_col: the index of the column containing the label - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - """ - self.sent1_col = sent1_col - self.sent2_col = sent2_col - self.label_col = label_col - self.num_classes = num_classes - - def get_train_examples(self, file_path: str): - """Gets a collection of `InputExample`s for the train set.""" - return self._create_examples(self._read_tsv(file_path), "train") - - def get_dev_examples(self, file_path: str): - """Gets a collection of `InputExample`s for the dev set.""" - return self._create_examples(self._read_tsv(file_path), "dev") - - def get_labels(self): - """Gets the list of labels for this data set.""" - if self.num_classes == 2: - return ['not_entailment', 'entailment'] - elif self.num_classes == 3: - return ["contradiction", "entailment", "neutral"] - else: - raise ValueError("num_classes must be either 2 or 3!") - - def _create_examples(self, lines: List[str], set_type: str): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[self.sent1_col] - text_b = line[self.sent2_col] - label = line[self.label_col] - if label == "-": - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class ZeroShotIntentDataset(GLUEDataset): - """ - Dataset for training a NLI model for zero shot intent recognition. Similar to GLUE/MNLI - dataset, but allows the user to specify which columns in the data files contain the - premise, hypothesis, and gold label. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: str, - sent1_col: int, - sent2_col: int, - label_col: int, - num_classes: int, - use_cache: bool = True, - ): - """ - Args: - file_path: path to file - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length including [CLS] and [SEP] - sent1_col: the index of the column containing the premise (or sentence 1) - sent2_col: the index of the column containing the hypothesis (or sentence 2) - label_col: the index of the column containing the label - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - use_cache: whether to use data cache - """ - self.task_name = "mnli" # for compatibility with parent class - data_dir, file_name = os.path.split(file_path) - logging.info(f'Processing {file_name}') - self.tokenizer = tokenizer - evaluate = False if 'train' in file_name else True - processor = ZeroShotIntentProcessor(sent1_col, sent2_col, label_col, num_classes) - self.label_list = processor.get_labels() - if not evaluate: - self.examples = processor.get_train_examples(file_path) - - # check the labels found in the training set - all_train_labels = [example.label for example in self.examples] - unique_labels = set(all_train_labels) - if len(unique_labels) != num_classes: - raise ValueError( - "Number of classes specified in config doesn't match the number found in the training data!" - ) - elif len(unique_labels) == 2: - if not unique_labels == set(self.label_list): - raise ValueError( - f"Found unexpected labels! For a two-class model, labels are expected to be {self.label_list}" - ) - elif len(unique_labels) == 3: - if not unique_labels == set(self.label_list): - raise ValueError( - f"Found unexpected labels! 
For a three-class model, labels are expected to be {self.label_list}" - ) - - # save the label map for reference - label_file = os.path.join(data_dir, "label_ids.csv") - with open(label_file, "w") as out: - out.write('\n'.join(self.label_list)) - logging.info(f'Labels: {self.label_list}') - logging.info(f'Label mapping saved to : {label_file}') - - else: - self.examples = processor.get_dev_examples(file_path) - - processor_name = type(processor).__name__ - vocab_size = getattr(tokenizer, "vocab_size", 0) - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}_{}".format( - processor_name, file_name, tokenizer.name, str(max_seq_length), str(vocab_size) - ), - ) - - if use_cache and os.path.exists(cached_features_file): - logging.info(f"loading from {cached_features_file}") - with open(cached_features_file, "rb") as reader: - self.features = pickle.load(reader) - else: - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - - self.features = self.convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode="classification", **token_params - ) - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if master_device: - logging.info(f'Saving train features into {cached_features_file}') - with open(cached_features_file, "wb") as writer: - pickle.dump(self.features, writer) - - -class ZeroShotIntentInferenceDataset(GLUEDataset): - """ - Similar to ZeroShotIntentDataset, but gets utterances and candidate labels from lists - rather than sentence pairs and labels from a file. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__( - self, - queries: List[str], - candidate_labels: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int, - hypothesis_template: str, - ): - """ - Args: - queries: list of utterances to classify - candidate_labels: list of candidate labels - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length including [CLS] and [SEP] - hypothesis_template: template used to turn each candidate label into a NLI-style hypothesis - """ - - logging.info(f'Processing queries for inference') - self.tokenizer = tokenizer - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - self.examples = [] - for i, query in enumerate(queries): - for j, candidate_label in enumerate(candidate_labels): - guid = "query-%s-label-%s" % (i, j) - text_a = query - text_b = hypothesis_template.format(candidate_label) - label = 3 # dummy label for inference; training labels are 0, 1, 2 or 0, 1 - self.examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - - self.features = self.convert_examples_to_features( - self.examples, [0, 1, 2, 3], max_seq_length, tokenizer, output_mode="classification", **token_params - ) - - -def calc_class_weights_from_dataloader( - dataloader: 'torch.utils.data.DataLoader', num_classes: int, data_dir: str -) -> List[float]: - """ - Calculate the weights of each class to be used for weighted loss. This is similar to the function calc_class_weights - in text_classification_dataset, but it gets the labels from a dataloader rather than from a file. - Args: - dataloader: the dataloader for the training set - num_classes: number of classes in the dataset - """ - labels = [] - for batch in dataloader: - labels.extend(tensor2list(batch[-1])) - logging.info(f'Calculating label frequency stats...') - total_sents, sent_label_freq, max_id = get_label_stats( - labels, os.path.join(data_dir, 'sentence_stats.tsv'), verbose=False - ) - if max_id >= num_classes: - raise ValueError(f'Found an invalid label! Labels should be from [0, num_classes-1].') - - class_weights_dict = get_freq_weights(sent_label_freq) - - logging.info(f'Total Sentence Pairs: {total_sents}') - logging.info(f'Class Frequencies: {sent_label_freq}') - logging.info(f'Class Weights: {class_weights_dict}') - class_weights = fill_class_weights(weights=class_weights_dict, max_id=num_classes - 1) - return class_weights diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py deleted file mode 100644 index 794f43dcbb52..000000000000 --- a/nemo/collections/nlp/metrics/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.metrics.classification_report import ( # noqa: F401 - ClassificationReport, - MultiLabelClassificationReport, -) -from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity # noqa: F401 diff --git a/nemo/collections/nlp/metrics/sequence_perplexity.py b/nemo/collections/nlp/metrics/sequence_perplexity.py deleted file mode 100644 index 339f062f7cc1..000000000000 --- a/nemo/collections/nlp/metrics/sequence_perplexity.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torchmetrics import Metric - -__all__ = ['SequencePerplexity'] - - -class SequencePerplexity(Metric): - """ - This class computes mean perplexity across the batches of sequences. - - You have to provide ``log_probs`` (float tensor of shape [batch_size x seq_length x vocab_size]) and - ``labels`` (int tensor of shape [batch_size x seq_length] with values from the range [0, vocab_size-1]) - to the :meth:`update` method. If some of the sequences are shorter than seq_length, you can also provide - an optional argument ``mask`` (bool tensor of shape [batch_size x seq_length]) which masks out tokens - not participating in perplexity computation. - - See :doc:`PyTorch Lightning Metrics` for the metric usage instructions. - - Args: - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. default: ``None`` (which selects the entire - world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP will be used - to perform the allgather. - """ - - def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): - super().__init__( - dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn, - ) - - # Total sum of exponentiated average negative log likelihoods - self.add_state('perplexities_sum', default=torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') - # Total number of sequences in all batches - self.add_state('num_sequences', default=torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') - - def update(self, log_probs: torch.Tensor, labels: torch.Tensor, mask=None): - - if mask is None: - mask = torch.ones_like(labels) - if mask.dtype is not log_probs.dtype: - mask = mask.to(log_probs.dtype) - - target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2) - avg_neg_ll = -(target_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1) - ppl = avg_neg_ll.exp() - self.num_sequences += ppl.numel() - self.perplexities_sum += ppl.sum() - - def compute(self): - """ - Returns perplexity across all workers and resets to 0 :attr:`perplexities_sum` and :attr:`num_sequences`. 
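The update() method above produces one perplexity value per sequence, the exponentiated average negative log-likelihood over non-masked tokens, and compute() averages those values across sequences and workers. A standalone numeric sketch of the per-sequence step, using random tensors and no distributed synchronization:

import torch

batch, seq_len, vocab = 2, 4, 10
log_probs = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=-1)
labels = torch.randint(0, vocab, (batch, seq_len))
mask = torch.ones(batch, seq_len)          # zeros would mark padding positions

# Gather the log-probability of each target token, average the negatives
# over non-masked positions, and exponentiate.
target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2)
avg_neg_ll = -(target_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1)
ppl = avg_neg_ll.exp()                     # one perplexity per sequence
print(ppl.mean())                          # batch-mean perplexity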
- """ - if self.num_sequences.eq(0): - return None - return self.perplexities_sum / self.num_sequences diff --git a/nemo/collections/nlp/metrics/sgd_metrics.py b/nemo/collections/nlp/metrics/sgd_metrics.py deleted file mode 100644 index 53666fb08928..000000000000 --- a/nemo/collections/nlp/metrics/sgd_metrics.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Evaluation metrics for Schema-guided dialogue. - -This library provides functions for calculating the evaluation metrics for a -single dialogue. The following metrics are defined: - -(1) Active intent accuracy: The fraction of user turns for which the active - intent has been correctly predicted. -(2) Slot tagging F1: The macro-averaged F1 score for tagging slot values for - non-categorical slots. This metric is optional to report in the final paper - if participants decide not to use slot tagging. -(3) Requested slots F1: The macro-averaged F1 score for requested slots over the - turns. For a turn, if there are no requested slots in both the ground truth - and the prediction, that turn is skipped. The reported number is the average - F1 score for all un-skipped user turns. This metric is optional to report in - the final paper. -(4) Average goal accuracy: For each turn, participants must predict a single - value for each slot present in the dialogue state. The slots which have a - non-empty assignment in the ground truth dialogue state are only considered. - This is the average accuracy of predicting the value of a slot correctly. A - fuzzy matching based score is used for non-categorical slots. -(5) Joint goal accuracy: This is the average accuracy of predicting all slot - assignments for a turn correctly. A fuzzy matching based score is used for - non-categorical slots. This is the primary evaluation metric used for ranking - submissions. More details to follow with the evaluation script. - -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/metrics.py -""" - -import collections - -import numpy as np -from rapidfuzz import fuzz - -F1Scores = collections.namedtuple("F1Scores", ["f1", "precision", "recall"]) - -# Evaluation and other relevant metrics for DSTC8/SGD Schema-guided DST. -# (1) Active intent accuracy. -ACTIVE_INTENT_ACCURACY = "active_intent_accuracy" -# (2) Slot tagging F1. -SLOT_TAGGING_F1 = "slot_tagging_f1" -SLOT_TAGGING_PRECISION = "slot_tagging_precision" -SLOT_TAGGING_RECALL = "slot_tagging_recall" -# (3) Requested slots F1. -REQUESTED_SLOTS_F1 = "requested_slots_f1" -REQUESTED_SLOTS_PRECISION = "requested_slots_precision" -REQUESTED_SLOTS_RECALL = "requested_slots_recall" -# (4) Average goal accuracy. 
-AVERAGE_GOAL_ACCURACY = "average_goal_accuracy" -AVERAGE_CAT_ACCURACY = "average_cat_accuracy" -AVERAGE_NONCAT_ACCURACY = "average_noncat_accuracy" -# (5) Joint goal accuracy. -JOINT_GOAL_ACCURACY = "joint_goal_accuracy" -JOINT_CAT_ACCURACY = "joint_cat_accuracy" -JOINT_NONCAT_ACCURACY = "joint_noncat_accuracy" - - -AVERAGE_CAT_STATUS_ACCURACY = "average_cat_status_accuracy" -AVERAGE_CAT_VALUE_ACCURACY = "average_cat_value_accuracy" -AVERAGE_NONCAT_STATUS_ACCURACY = "average_noncat_status_accuracy" -AVERAGE_NONCAT_VALUE_ACCURACY = "average_noncat_value_accuracy" - -JOINT_CAT_STATUS_ACCURACY = "joint_cat_status_accuracy" -JOINT_CAT_VALUE_ACCURACY = "joint_cat_value_accuracy" -JOINT_NONCAT_STATUS_ACCURACY = "joint_noncat_status_accuracy" -JOINT_NONCAT_VALUE_ACCURACY = "joint_noncat_value_accuracy" - - -NAN_VAL = "NA" - - -def compute_f1(list_ref, list_hyp): - """Compute F1 score from reference (grouth truth) list and hypothesis list. - Args: - list_ref: List of true elements. - list_hyp: List of postive (retrieved) elements. - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - - ref = collections.Counter(list_ref) - hyp = collections.Counter(list_hyp) - true = sum(ref.values()) - positive = sum(hyp.values()) - true_positive = sum((ref & hyp).values()) - precision = float(true_positive) / positive if positive else 1.0 - recall = float(true_positive) / true if true else 1.0 - if precision + recall > 0.0: - f1 = 2.0 * precision * recall / (precision + recall) - else: # The F1-score is defined to be 0 if both precision and recall are 0. - f1 = 0.0 - - return F1Scores(f1=f1, precision=precision, recall=recall) - - -def fuzzy_string_match(str_ref, str_hyp): - """Returns fuzzy string similarity score in range [0.0, 1.0]. - Args: - str_ref: reference string - str_hyp: hypothesis string - Returns: - fuzzy string similarity - """ - - # The higher the score, the higher the similarity between the two strings. - return fuzz.token_sort_ratio(str_ref, str_hyp) / 100.0 - - -def noncat_slot_value_match(str_ref_list, str_hyp, use_fuzzy_match): - """Calculate non-categorical slots correctness. - Args: - str_ref_list: a list of reference strings. - str_hyp: the hypothesis string. - use_fuzzy_match: whether to use fuzzy string matching. - Returns: - score: The highest fuzzy string match score of the references and hypotheis. - """ - score = 0.0 - for str_ref in str_ref_list: - if use_fuzzy_match: - match_score = fuzzy_string_match(str_ref, str_hyp) - else: - match_score = float(str_ref == str_hyp) - score = max(score, match_score) - return score - - -def compare_slot_values(slot_values_ref, slot_values_hyp, service, use_fuzzy_match): - """Compare and get correctness of goal state's slot_values. - - Args: - slot_values_ref: goal state slot_values from reference (ground truth). - slot_values_hyp: goal state slot_values from hypothesis (prediction). - service: a service data structure in the schema. We use it to obtain the - list of slots in the service and infer whether a slot is categorical. - use_fuzzy_match: whether to use fuzzy string matching for non-categorical - slot values - - Returns: - list_cor: list of corectness scores, each corresponding to one slot in the - service. The score is a float either 0.0 or 1.0 for categorical slot, - and in range [0.0, 1.0] for non-categorical slot. - slot_active: list indicating whether the element in list_cor corresponds to - an active ground-truth slot. 
- slot_cat: list indicating whether the element in list_cor corresponds to a - categorical slot. - list_cor_status: list of correct slot statuses - list_cor_value: list of correctness score only for active slots. Monactive slots are assigned -1. - """ - list_cor = [] - list_cor_status = [] - list_cor_value = [] - slot_active = [] - slot_cat = [] - - for slot in service["slots"]: - slot_name = slot["name"] - slot_cat.append(slot["is_categorical"]) - - if slot_name in slot_values_ref: # REF=active - slot_active.append(True) - if slot_name in slot_values_hyp: # HYP=active, apply matching - value_ref_list = slot_values_ref[slot_name] - value_hyp = slot_values_hyp[slot_name][0] - if slot["is_categorical"]: - cor = float(value_ref_list[0] == value_hyp) - else: - cor = noncat_slot_value_match(value_ref_list, value_hyp, use_fuzzy_match) - list_cor.append(cor) - list_cor_status.append(1.0) - list_cor_value.append(cor) - else: # HYP=off - list_cor.append(0.0) - list_cor_status.append(0.0) - list_cor_value.append(-1.0) - else: # REF=off - slot_active.append(False) - if slot_name in slot_values_hyp: # HYP=active - list_cor.append(0.0) - list_cor_status.append(0.0) - else: # HYP=off - list_cor.append(1.0) - list_cor_status.append(1.0) - list_cor_value.append(-1.0) - - assert len(list_cor) == len(service["slots"]) - assert len(slot_active) == len(service["slots"]) - assert len(slot_cat) == len(service["slots"]) - return list_cor, slot_active, slot_cat, list_cor_status, list_cor_value - - -def get_active_intent_accuracy(frame_ref, frame_hyp): - """Get active intent accuracy of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - - Returns: - 1.0 if the intent prediction is correct, otherwise 0.0. - """ - return float(frame_ref["state"]["active_intent"] == frame_hyp["state"]["active_intent"]) - - -def get_slot_tagging_f1(frame_ref, frame_hyp, utt, service): - """Get slot tagging (non-categorical slots only) F1 scores of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - utt: user utterance. Slot tagging annotations are the character positions in - the utterance. - service: a service data structure in the schema. We use it to infer whether - a slot is non-categorical. - - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - list_noncat_slots = [s["name"] for s in service["slots"] if not s["is_categorical"]] - if "slots" not in frame_hyp: - return None - else: - list_ref = [ - (s["slot"], utt[s["start"] : s["exclusive_end"]]) - for s in frame_ref["slots"] - if s["slot"] in list_noncat_slots - ] - list_hyp = [ - (s["slot"], utt[s["start"] : s["exclusive_end"]]) - for s in frame_hyp["slots"] - if s["slot"] in list_noncat_slots - ] - return compute_f1(list_ref, list_hyp) - - -def get_requested_slots_f1(frame_ref, frame_hyp): - """Get requested slots F1 scores of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - return compute_f1(frame_ref["state"]["requested_slots"], frame_hyp["state"]["requested_slots"]) - - -def get_average_and_joint_goal_accuracy(frame_ref, frame_hyp, service, use_fuzzy_match): - """Get average and joint goal accuracies of a frame. 
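compute_f1 above treats the reference and hypothesis lists as multisets via collections.Counter, which is how both the slot-tagging and requested-slots F1 scores are obtained. The self-contained restatement below, called on arbitrary requested-slot names, shows how precision, recall, and F1 fall out of the counts:

import collections

F1Scores = collections.namedtuple("F1Scores", ["f1", "precision", "recall"])


def compute_f1(list_ref, list_hyp):
    ref, hyp = collections.Counter(list_ref), collections.Counter(list_hyp)
    true, positive = sum(ref.values()), sum(hyp.values())
    true_positive = sum((ref & hyp).values())       # multiset intersection
    precision = true_positive / positive if positive else 1.0
    recall = true_positive / true if true else 1.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return F1Scores(f1=f1, precision=precision, recall=recall)


print(compute_f1(["city", "date", "date"], ["date", "time"]))
# ~ F1Scores(f1=0.4, precision=0.5, recall=0.33)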
- - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - service: a service data structure in the schema. We use it to obtain the - list of slots in the service and infer whether a slot is categorical. - use_fuzzy_match: whether to use fuzzy string matching for comparing - non-categorical slot values. - - Returns: - goal_acc: a dict whose values are average / joint - all-goal / categorical-goal / non-categorical-goal accuracies. - """ - goal_acc = {} - - list_acc, slot_active, slot_cat, list_status_acc, list_value_acc = compare_slot_values( - frame_ref["state"]["slot_values"], frame_hyp["state"]["slot_values"], service, use_fuzzy_match - ) - - # (4) Average goal accuracy. - active_acc = [acc for acc, active in zip(list_acc, slot_active) if active] - goal_acc[AVERAGE_GOAL_ACCURACY] = np.mean(active_acc) if active_acc else NAN_VAL - # (4-a) categorical. - active_cat_acc = [acc for acc, active, cat in zip(list_acc, slot_active, slot_cat) if active and cat] - goal_acc[AVERAGE_CAT_ACCURACY] = np.mean(active_cat_acc) if active_cat_acc else NAN_VAL - # (4-b) non-categorical. - active_noncat_acc = [acc for acc, active, cat in zip(list_acc, slot_active, slot_cat) if active and not cat] - goal_acc[AVERAGE_NONCAT_ACCURACY] = np.mean(active_noncat_acc) if active_noncat_acc else NAN_VAL - - # (5) Joint goal accuracy. - goal_acc[JOINT_GOAL_ACCURACY] = np.prod(list_acc) if list_acc else NAN_VAL - # (5-a) categorical. - cat_acc = [acc for acc, cat in zip(list_acc, slot_cat) if cat] - goal_acc[JOINT_CAT_ACCURACY] = np.prod(cat_acc) if cat_acc else NAN_VAL - # (5-b) non-categorical. - noncat_acc = [acc for acc, cat in zip(list_acc, slot_cat) if not cat] - goal_acc[JOINT_NONCAT_ACCURACY] = np.prod(noncat_acc) if noncat_acc else NAN_VAL - - # !!!!!!!!!!DEBUG!!!!!!!!!!!!! 
- # cat status acc for both active and non active - active_cat_status_acc = [acc for acc, active, cat in zip(list_status_acc, slot_active, slot_cat) if cat and active] - goal_acc[AVERAGE_CAT_STATUS_ACCURACY] = np.mean(active_cat_status_acc) if active_cat_status_acc else NAN_VAL - # joint cat status acc for both active and non active - cat_status_acc = [acc for acc, cat in zip(list_status_acc, slot_cat) if cat] - goal_acc[JOINT_CAT_STATUS_ACCURACY] = np.prod(cat_status_acc) if cat_status_acc else NAN_VAL - - # non cat status acc for both active and non active - active_noncat_status_acc = [ - acc for acc, active, cat in zip(list_status_acc, slot_active, slot_cat) if not cat and active - ] - goal_acc[AVERAGE_NONCAT_STATUS_ACCURACY] = ( - np.mean(active_noncat_status_acc) if active_noncat_status_acc else NAN_VAL - ) - # joint non cat status acc for both active and non active - noncat_status_acc = [acc for acc, cat in zip(list_status_acc, slot_cat) if not cat] - goal_acc[JOINT_NONCAT_STATUS_ACCURACY] = np.prod(noncat_status_acc) if noncat_status_acc else NAN_VAL - - # cat value acc for both active and non active - active_cat_val_acc = [ - acc for acc, active, cat in zip(list_value_acc, slot_active, slot_cat) if cat and acc > -0.5 and active - ] - goal_acc[AVERAGE_CAT_VALUE_ACCURACY] = np.mean(active_cat_val_acc) if active_cat_val_acc else NAN_VAL - # joint cat value acc for both active and non active - cat_val_acc = [acc for acc, cat in zip(list_value_acc, slot_cat) if cat and acc > -0.5] - goal_acc[JOINT_CAT_VALUE_ACCURACY] = np.prod(cat_val_acc) if cat_val_acc else NAN_VAL - - # cat non value acc for both active and non active - active_noncat_val_acc = [ - acc for acc, active, cat in zip(list_value_acc, slot_active, slot_cat) if not cat and acc > -0.5 and active - ] - goal_acc[AVERAGE_NONCAT_VALUE_ACCURACY] = np.mean(active_noncat_val_acc) if active_noncat_val_acc else NAN_VAL - # joint non cat value acc for both active and non active - noncat_val_acc = [acc for acc, cat in zip(list_value_acc, slot_cat) if not cat and acc > -0.5] - goal_acc[JOINT_NONCAT_VALUE_ACCURACY] = np.prod(noncat_val_acc) if noncat_val_acc else NAN_VAL - - return goal_acc diff --git a/nemo/collections/nlp/models/__init__.py b/nemo/collections/nlp/models/__init__.py index 6fa39cbe053b..25b4980f70c3 100644 --- a/nemo/collections/nlp/models/__init__.py +++ b/nemo/collections/nlp/models/__init__.py @@ -12,21 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
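To make the difference between the average and joint goal accuracies computed above concrete, here is a tiny numeric illustration with made-up per-slot scores, using the same np.mean over active slots versus np.prod over all slots as get_average_and_joint_goal_accuracy:

import numpy as np

list_acc = [1.0, 0.8, 0.0]             # one correctness score per slot in the service
slot_active = [True, True, False]      # slots set in the ground-truth dialogue state

active_acc = [a for a, act in zip(list_acc, slot_active) if act]
average_goal_accuracy = np.mean(active_acc)   # (1.0 + 0.8) / 2 = 0.9
joint_goal_accuracy = np.prod(list_acc)       # 1.0 * 0.8 * 0.0 = 0.0
print(average_goal_accuracy, joint_goal_accuracy)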
- -from nemo.collections.nlp.models.entity_linking.entity_linking_model import EntityLinkingModel # noqa: F401 -from nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model import GLUEModel # noqa: F401 -from nemo.collections.nlp.models.information_retrieval import BertDPRModel, BertJointIRModel # noqa: F401 -from nemo.collections.nlp.models.intent_slot_classification import ( # noqa: F401 - IntentSlotClassificationModel, - MultiLabelIntentSlotClassificationModel, -) from nemo.collections.nlp.models.language_modeling import MegatronGPTPromptLearningModel # noqa: F401 from nemo.collections.nlp.models.language_modeling.bert_lm_model import BERTLMModel # noqa: F401 from nemo.collections.nlp.models.language_modeling.transformer_lm_model import TransformerLMModel # noqa: F401 from nemo.collections.nlp.models.machine_translation import MTEncDecModel # noqa: F401 -from nemo.collections.nlp.models.token_classification import ( # noqa: F401 - PunctuationCapitalizationLexicalAudioModel, - PunctuationCapitalizationModel, - TokenClassificationModel, -) -from nemo.collections.nlp.models.zero_shot_intent_recognition import ZeroShotIntentModel # noqa: F401 diff --git a/nemo/collections/nlp/models/entity_linking/__init__.py b/nemo/collections/nlp/models/entity_linking/__init__.py deleted file mode 100644 index 925bfc18c77e..000000000000 --- a/nemo/collections/nlp/models/entity_linking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.entity_linking.entity_linking_model import EntityLinkingModel diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py deleted file mode 100644 index 640520cdaaa7..000000000000 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoTokenizer - -from nemo.collections.common.losses import MultiSimilarityLoss -from nemo.collections.nlp.data import EntityLinkingDataset -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import LogitsType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['EntityLinkingModel'] - - -class EntityLinkingModel(NLPModel, Exportable): - """ - Second stage pretraining of BERT based language model - for entity linking task. An implementation of Liu et. al's - NAACL 2021 paper Self-Alignment Pretraining for Biomedical Entity Representations. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"logits": NeuralType(('B', 'D'), LogitsType())} - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes the SAP-BERT model for entity linking.""" - - # deprecation warning - deprecated_warning("EntityLinkingModel") - - # tokenizer needed before super().__init__() so dataset and loader can process data - self._setup_tokenizer(cfg.tokenizer) - - super().__init__(cfg=cfg, trainer=trainer) - - # Token to use for the self-alignment loss, typically the first token, [CLS] - self._idx_conditioned_on = 0 - self.loss = MultiSimilarityLoss() - - def _setup_tokenizer(self, cfg: DictConfig): - tokenizer = AutoTokenizer.from_pretrained( - cfg.tokenizer_name, vocab_file=cfg.vocab_file, do_lower_case=cfg.do_lower_case - ) - - self.tokenizer = tokenizer - - @typecheck() - def forward(self, input_ids, token_type_ids, attention_mask): - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - # normalize to unit sphere - logits = torch.nn.functional.normalize(hidden_states[:, self._idx_conditioned_on], p=2, dim=1) - return logits - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - input_ids, token_type_ids, attention_mask, concept_ids = batch - logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - train_loss = self.loss(logits=logits, labels=concept_ids) - - # No hard examples found in batch, - # shouldn't use this batch to update model weights - if train_loss == 0: - train_loss = None - lr = None - - else: - lr = self._optimizer.param_groups[0]["lr"] - self.log("train_loss", train_loss) - self.log("lr", lr, prog_bar=True) - - return {"loss": train_loss, "lr": lr} - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
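The forward pass above keeps only the hidden state of the conditioning token (index 0, i.e. [CLS]) and L2-normalizes it, so the multi-similarity loss sees unit-length concept embeddings. A standalone sketch with a random tensor standing in for the BERT encoder output:

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 4, 16, 768
hidden_states = torch.randn(batch, seq_len, hidden)   # stand-in for the encoder output
idx_conditioned_on = 0                                 # first token, i.e. [CLS]

# Project the [CLS] embedding onto the unit sphere so dot products
# behave like cosine similarities.
logits = F.normalize(hidden_states[:, idx_conditioned_on], p=2, dim=1)
print(logits.shape)        # torch.Size([4, 768])
print(logits.norm(dim=1))  # all ones: unit-length embeddings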
- """ - input_ids, input_type_ids, input_mask, concept_ids = batch - with torch.no_grad(): - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - val_loss = self.loss(logits=logits, labels=concept_ids) - - # No hard examples found in batch, - # val loss not used to update model weights - if val_loss == 0: - val_loss = None - else: - self.log("val_loss", val_loss) - logging.info(f"val loss: {val_loss}") - - loss = {"val_loss": val_loss} - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - - Args: - outputs: list of individual outputs of each validation step. - Returns: - - """ - if self.validation_step_outputs: - avg_loss = torch.stack( - [x["val_loss"] for x in self.validation_step_outputs if x["val_loss"] != None] - ).mean() - self.log(f"val_loss", avg_loss, prog_bar=True) - self.validation_step_outputs.clear() # free memory - return {"val_loss": avg_loss} - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config or not train_data_config.data_file: - logging.info( - f"Dataloader config or file_path or processed data path for the train dataset is missing, \ - so no data loader for train is created!" - ) - - self._train_dl = None - return - - self._train_dl = self.setup_dataloader(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config or not val_data_config.data_file: - logging.info( - f"Dataloader config or file_path or processed data path for the val dataset is missing, \ - so no data loader for validation is created!" - ) - - self._validation_dl = None - return - - self._validation_dl = self.setup_dataloader(cfg=val_data_config) - - def setup_dataloader(self, cfg: Dict, is_index_data: bool = False) -> 'torch.utils.data.DataLoader': - - dataset = EntityLinkingDataset( - tokenizer=self.tokenizer, - data_file=cfg.data_file, - max_seq_length=cfg.max_seq_length, - is_index_data=is_index_data, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - shuffle=cfg.get("shuffle", True), - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass - - @classmethod - def from_pretrained(cls, name: str): - pass diff --git a/nemo/collections/nlp/models/glue_benchmark/__init__.py b/nemo/collections/nlp/models/glue_benchmark/__init__.py deleted file mode 100644 index eecc4db9100c..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model import GLUEModel diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py deleted file mode 100644 index e90cf9d88c30..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import CrossEntropyLoss, MSELoss -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUE_TASKS_NUM_LABELS, GLUEDataset -from nemo.collections.nlp.models.glue_benchmark.metrics_for_glue import compute_metrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceClassifier, SequenceRegression -from nemo.collections.nlp.parts.utils_funcs import list2str, tensor2list -from nemo.core.classes import typecheck -from nemo.core.neural_types import NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['GLUEModel'] - -''' -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -Example of running a pretrained BERT model on the 9 GLUE tasks, read more -about GLUE benchmark here: https://gluebenchmark.com -Download the GLUE data by running the script: -https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e - -Some of these tasks have a small dataset and training can lead to high variance -in the results between different runs. Below is the median on 5 runs -(with different seeds) for each of the metrics on the dev set of the benchmark -with an uncased BERT base model (the checkpoint bert-base-uncased) -(source https://github.com/huggingface/transformers/tree/master/examples#glue). -Task Metric Result -CoLA Matthew's corr 48.87 -SST-2 Accuracy 91.74 -MRPC F1/Accuracy 90.70/86.27 -STS-B Person/Spearman corr. 91.39/91.04 -QQP Accuracy/F1 90.79/87.66 -MNLI Matched acc./Mismatched acc. 83.70/84.83 -QNLI Accuracy 89.31 -RTE Accuracy 71.43 -WNLI Accuracy 43.66 - -''' - - -class GLUEModel(NLPModel): - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return self.bert_model.input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return self.pooler.output_types - - @property - def output_module(self): - return self.pooler - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ - Initializes model to use BERT model for GLUE tasks. 
- """ - # deprecation warning - deprecated_warning("GLUEModel") - - if cfg.task_name not in cfg.supported_tasks: - raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') - self.task_name = cfg.task_name - - # needed to setup validation on multiple datasets - # MNLI task has two separate dev sets: matched and mismatched - if not self._is_model_being_restored(): - if self.task_name == "mnli": - cfg.validation_ds.ds_item = [ - os.path.join(cfg.dataset.data_dir, 'dev_matched.tsv'), - os.path.join(cfg.dataset.data_dir, 'dev_mismatched.tsv'), - ] - else: - cfg.validation_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.validation_ds.ds_item) - cfg.train_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.train_ds.ds_item) - logging.info(f'Using {cfg.validation_ds.ds_item} for model evaluation.') - - super().__init__(cfg=cfg, trainer=trainer) - - num_labels = GLUE_TASKS_NUM_LABELS[self.task_name] - # uses [CLS] token for classification (the first token) - if self.task_name == "sts-b": - self.pooler = SequenceRegression(hidden_size=self.bert_model.config.hidden_size) - self.loss = MSELoss() - else: - self.pooler = SequenceClassifier( - hidden_size=self.bert_model.config.hidden_size, num_classes=num_labels, log_softmax=False - ) - self.loss = CrossEntropyLoss() - - def update_data_dir(self, data_dir: str) -> None: - """ - Update data directory and get data stats with Data Descriptor - Weights are later used to setup loss - - Args: - data_dir: path to data directory - """ - self._cfg.dataset.data_dir = data_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - if self.task_name == "mnli": - self._cfg.validation_ds.ds_item = [ - os.path.join(data_dir, 'dev_matched.tsv'), - os.path.join(data_dir, 'dev_mismatched.tsv'), - ] - else: - self._cfg.validation_ds.ds_item = os.path.join(data_dir, 'dev.tsv') - - self._cfg.train_ds.ds_item = os.path.join(data_dir, 'train.tsv') - logging.info(f'Using {self._cfg.validation_ds.ds_item} for model evaluation.') - - @typecheck() - def forward(self, input_ids, token_type_ids, attention_mask): - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - output = self.pooler(hidden_states=hidden_states) - return output - - def training_step(self, batch, batch_idx): - input_ids, input_type_ids, input_mask, labels = batch - model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - if self.task_name == "sts-b": - loss = self.loss(preds=model_output, labels=labels) - else: - loss = self.loss(logits=model_output, labels=labels) - - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - input_ids, input_type_ids, input_mask, labels = batch - model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - if self.task_name == "sts-b": - val_loss = self.loss(preds=model_output, labels=labels) - else: - val_loss = self.loss(logits=model_output, labels=labels) - - if self.task_name != 'sts-b': - model_output = torch.argmax(model_output, 1) - - eval_tensors = {'preds': model_output, 'labels': labels} - output = {'val_loss': val_loss, 'eval_tensors': eval_tensors} - self.validation_step_outputs.append(output) - return output - - def 
multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - """ - Called at the end of validation to aggregate outputs. - outputs: list of individual outputs of each validation step. - """ - avg_loss = torch.stack([x['val_loss'] for x in self.validation_step_outputs]).mean() - preds = torch.cat([x['eval_tensors']['preds'] for x in self.validation_step_outputs]) - labels = torch.cat([x['eval_tensors']['labels'] for x in self.validation_step_outputs]) - - all_preds = [] - all_labels = [] - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - for ind in range(world_size): - all_preds.append(torch.empty_like(preds)) - all_labels.append(torch.empty_like(labels)) - torch.distributed.all_gather(all_preds, preds) - torch.distributed.all_gather(all_labels, labels) - else: - all_preds.append(preds) - all_labels.append(labels) - - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - preds = [] - labels = [] - for p in all_preds: - preds.extend(tensor2list(p)) - for l in all_labels: - labels.extend(tensor2list(l)) - - results = compute_metrics(self.task_name, np.array(preds), np.array(labels)) - val_name = self._validation_names[dataloader_idx].upper() - logging.info(f'{val_name} evaluation: {results}') - - # writing labels and predictions to a file in output_dir is specified in the config - output_dir = self._cfg.output_dir - if output_dir: - os.makedirs(output_dir, exist_ok=True) - filename = os.path.join(output_dir, f'{self.task_name}_{val_name}.txt') - logging.info(f'Saving labels and predictions to {filename}') - with open(filename, 'w') as f: - f.write('labels\t' + list2str(labels) + '\n') - f.write('preds\t' + list2str(preds) + '\n') - - self.log('val_loss', avg_loss) - if self.trainer.is_global_zero: - for k, v in results.items(): - self.log(f'{val_name}_{k}', v, rank_zero_only=True) - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - if train_data_config is None: - train_data_config = self._cfg.train_ds - - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - if val_data_config is None: - val_data_config = self._cfg.validation_ds - - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_multiple_validation_data(self, val_data_config: Union[DictConfig, Dict] = None): - if val_data_config is None: - val_data_config = self._cfg.validation_ds - - return super().setup_multiple_validation_data(val_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - file_name = cfg.ds_item - if not os.path.exists(file_name): - raise FileNotFoundError( - "GLUE datasets not found. 
For more details on how to get the data, see: " - "https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e" - ) - - dataset = GLUEDataset( - file_name=file_name, - task_name=self.task_name, - tokenizer=self.tokenizer, - max_seq_length=self._cfg.dataset.max_seq_length, - use_cache=self._cfg.dataset.use_cache, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=self._cfg.dataset.num_workers, - pin_memory=self._cfg.dataset.pin_memory, - drop_last=self._cfg.dataset.drop_last, - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass diff --git a/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py b/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py deleted file mode 100644 index dd4ecada1a87..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List - -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import f1_score, matthews_corrcoef - -__all__ = ['compute_metrics'] - - -def accuracy(preds: List[int], labels: List[int]): - return {"acc": (preds == labels).mean()} - - -def acc_and_f1(preds: List[int], labels: List[int]): - accuracy = (preds == labels).mean() - f1 = f1_score(y_true=labels, y_pred=preds) - return {"acc": accuracy, "f1": f1} - - -def mcc(preds: List[int], labels: List[int]): - return {"mcc": matthews_corrcoef(labels, preds)} - - -def pearson_and_spearman(preds: List[int], labels: List[int]): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return {"pearson": pearson_corr, "spearmanr": spearman_corr, "pear+spear av": (pearson_corr + spearman_corr) / 2} - - -def compute_metrics(task_name: str, preds: List[int], labels: List[int]) -> Dict[str, float]: - """ - Computes metrics for GLUE tasks - Args: - task_name: GLUE task name - preds: model predictions - labels: golden labels - Returns: - metrics - """ - if len(preds) != len(labels): - raise ValueError("Predictions and labels must have the same length") - - metric_fn = accuracy - if task_name == 'cola': - metric_fn = mcc - elif task_name in ['mrpc', 'qqp']: - metric_fn = acc_and_f1 - elif task_name == 'sts-b': - metric_fn = pearson_and_spearman - - return metric_fn(preds, labels) diff --git a/nemo/collections/nlp/models/information_retrieval/__init__.py b/nemo/collections/nlp/models/information_retrieval/__init__.py deleted file mode 100644 index f07a53c76cb2..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
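The `compute_metrics` helper above simply dispatches to task-specific metric functions. As a quick illustration (not part of the deleted files; the toy predictions and labels are made up), the same scipy/scikit-learn calls can be exercised standalone:

```python
# Toy illustration of the per-task metric dispatch in metrics_for_glue.py.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef

preds = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])

print({"mcc": matthews_corrcoef(labels, preds)})              # 'cola'
print({"acc": (preds == labels).mean(),
       "f1": f1_score(y_true=labels, y_pred=preds)})          # 'mrpc', 'qqp'
print({"acc": (preds == labels).mean()})                      # all other classification tasks

scores_pred = np.array([0.1, 0.4, 0.35, 0.8])                 # 'sts-b' compares continuous scores
scores_gold = np.array([0.0, 0.5, 0.30, 0.9])
pearson = pearsonr(scores_pred, scores_gold)[0]
spearman = spearmanr(scores_pred, scores_gold)[0]
print({"pearson": pearson, "spearmanr": spearman, "pear+spear av": (pearson + spearman) / 2})
```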
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from nemo.collections.nlp.models.information_retrieval.bert_dpr_model import BertDPRModel
-from nemo.collections.nlp.models.information_retrieval.bert_joint_ir_model import BertJointIRModel
diff --git a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py b/nemo/collections/nlp/models/information_retrieval/base_ir_model.py
deleted file mode 100644
index 91d86fef1851..000000000000
--- a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Dict, Optional
-
-import numpy as np
-import torch
-from lightning.pytorch import Trainer
-from omegaconf import DictConfig, OmegaConf
-
-from nemo.collections.nlp.data import BertInformationRetrievalDataset
-from nemo.collections.nlp.models.nlp_model import NLPModel
-from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
-from nemo.core.classes.common import typecheck
-
-__all__ = ['BaseIRModel']
-
-
-class BaseIRModel(NLPModel):
-    """
-    Base class for information retrieval models.
-    """
-
-    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
-
-        self.setup_tokenizer(cfg.tokenizer)
-
-        super().__init__(cfg=cfg, trainer=trainer)
-
-    @typecheck()
-    def forward(self, *args):
-        pass
-
-    def compute_scores_and_loss(self, inputs):
-        pass
-
-    @staticmethod
-    def get_lm_model_with_padded_embedding(cfg: DictConfig):
-        """
-        Function which ensures that vocabulary size is divisible by 8
-        for faster mixed precision training.
-        """
-        model = get_lm_model(
-            config_file=cfg.language_model.config_file,
-            config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
-            vocab_file=cfg.tokenizer.vocab_file,
-            trainer=None,  # no trainer object is in scope inside this static helper
-            cfg=cfg,
-        )
-        vocab_size, hidden_size = model.config.vocab_size, model.config.hidden_size
-        tokens_to_add = 8 * math.ceil(vocab_size / 8) - vocab_size
-        zeros = torch.zeros((tokens_to_add, hidden_size))
-        model.embeddings.word_embeddings.weight.data = torch.cat((model.embeddings.word_embeddings.weight.data, zeros))
-        return model
-
-    @staticmethod
-    def calculate_mean_reciprocal_rank(query2passages, query2rel):
-        """
-        Helper function which calculates mean reciprocal rank.
- Args: - query2passages: dict which contains passage ids and corresponding - scores for each query - query2rel: dict which contains ids of relevant passages for each query - """ - reciprocal_ranks = [] - - for query in query2passages: - indices = np.argsort(query2passages[query]["scores"])[::-1] - sorted_psgs = query2passages[query]["psg_ids"][indices] - - reciprocal_ranks.append(0) - for i, psg_id in enumerate(sorted_psgs): - if psg_id in query2rel[query]: - reciprocal_ranks[-1] = 1 / (i + 1) - break - return np.mean(reciprocal_ranks) - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - scores, train_loss = self.compute_scores_and_loss(batch[:-2]) - tensorboard_logs = {"train_loss": train_loss, "lr": self._optimizer.param_groups[0]["lr"]} - return {"loss": train_loss, "log": tensorboard_logs} - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - scores, val_loss = self.compute_scores_and_loss(batch[:-2]) - query_ids, passage_ids = batch[-2:] - data_for_val = { - "val_loss": val_loss, - "scores": scores, - "query_ids": query_ids, - "passage_ids": passage_ids, - } - self.validation_step_outputs.append(data_for_val) - return data_for_val - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. - """ - - query_ids = torch.cat([x["query_ids"] for x in self.validation_step_outputs]) - passage_ids = torch.cat([x["passage_ids"] for x in self.validation_step_outputs]) - scores = torch.cat([x["scores"] for x in self.validation_step_outputs]) - - all_query_ids, all_passage_ids, all_scores = [], [], [] - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - for ind in range(world_size): - all_query_ids.append(torch.empty_like(query_ids)) - all_passage_ids.append(torch.empty_like(passage_ids)) - all_scores.append(torch.empty_like(scores)) - torch.distributed.all_gather(all_query_ids, query_ids) - torch.distributed.all_gather(all_passage_ids, passage_ids) - torch.distributed.all_gather(all_scores, scores) - else: - all_query_ids.append(query_ids) - all_passage_ids.append(passage_ids) - all_scores.append(scores) - - val_mrr = 0 - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - query2passages, query2rels = {}, {} - processed_queries = set() - - for i in range(len(all_query_ids)): - - query_ids = all_query_ids[i].detach().cpu().numpy() - passage_ids = all_passage_ids[i].detach().cpu().numpy() - scores = all_scores[i].detach().cpu().numpy() - - for j, query_id in enumerate(query_ids): - - if query_id not in processed_queries: - processed_queries.add(query_id) - query2passages[query_id] = { - "psg_ids": passage_ids[j], - "scores": scores[j], - } - query2rels[query_id] = [passage_ids[j][0]] - else: - query2passages[query_id]["psg_ids"] = np.concatenate( - (query2passages[query_id]["psg_ids"], passage_ids[j][1:]) - ) - query2passages[query_id]["scores"] = np.concatenate( - (query2passages[query_id]["scores"], scores[j][1:]) - ) - - val_mrr = self.calculate_mean_reciprocal_rank(query2passages, query2rels) - - val_loss = torch.stack([x["val_loss"] for x in self.validation_step_outputs]).mean() - self.validation_step_outputs.clear() # free memory - tensorboard_logs = { - "val_mrr": 
val_mrr, - "val_loss": val_loss, - } - - return {"log": tensorboard_logs} - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - - dataset = BertInformationRetrievalDataset( - tokenizer=self.tokenizer, - passages=cfg.passages, - queries=cfg.queries, - query_to_passages=cfg.query_to_passages, - num_negatives=cfg.num_negatives, - psg_cache_format=cfg.get("psg_cache_format", "pkl"), - max_query_length=cfg.get("max_query_length", 31), - max_passage_length=cfg.get("max_passage_length", 190), - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass diff --git a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py b/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py deleted file mode 100644 index bfbec123d13e..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import SmoothedCrossEntropyLoss -from nemo.collections.nlp.data import BertInformationRetrievalDataset -from nemo.collections.nlp.models.information_retrieval.base_ir_model import BaseIRModel -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import ChannelType, LogitsType, MaskType, NeuralType - -__all__ = ["BertDPRModel"] - - -class BertDPRModel(BaseIRModel): - """ - Information retrieval model which encodes query and passage separately - with two different BERT encoders and computes their similarity score - as a dot-product between corresponding [CLS] token representations. 
- """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "q_input_ids": NeuralType(("B", "T"), ChannelType()), - "q_attention_mask": NeuralType(("B", "T"), MaskType()), - "q_token_type_ids": NeuralType(("B", "T"), ChannelType()), - "p_input_ids": NeuralType(("B", "T"), ChannelType()), - "p_attention_mask": NeuralType(("B", "T"), MaskType()), - "p_token_type_ids": NeuralType(("B", "T"), ChannelType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"logits": NeuralType(("B", "D"), LogitsType())} - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - model_name = cfg.language_model.pretrained_model_name - self.tokenizer = get_tokenizer(tokenizer_name=model_name) - - super().__init__(cfg=cfg, trainer=trainer) - - self.q_encoder = self.get_lm_model_with_padded_embedding(cfg) - self.p_encoder = self.get_lm_model_with_padded_embedding(cfg) - self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id) - - @typecheck() - def forward( - self, - q_input_ids, - q_token_type_ids, - q_attention_mask, - p_input_ids, - p_token_type_ids, - p_attention_mask, - ): - - q_vectors = self.q_encoder( - input_ids=q_input_ids, - token_type_ids=q_token_type_ids, - attention_mask=q_attention_mask, - ) - q_vectors = q_vectors[:, 0] - batch_size, hidden_size = q_vectors.size() - - p_vectors = self.p_encoder( - input_ids=p_input_ids, - token_type_ids=p_token_type_ids, - attention_mask=p_attention_mask, - ) - num_passages = p_vectors.shape[0] // batch_size - p_vectors = p_vectors[:, 0].view(-1, num_passages, hidden_size) - p_positives, p_negatives = p_vectors[:, 0], p_vectors[:, 1:] - scores = torch.cat( - ( - torch.matmul(q_vectors, p_positives.T), - torch.einsum("ij,ipj->ip", q_vectors, p_negatives), - ), - dim=1, - ) - - return scores - - def compute_scores_and_loss(self, inputs): - ( - q_input_ids, - q_input_mask, - q_input_type_ids, - p_input_ids, - p_input_mask, - p_input_type_ids, - ) = inputs - batch_size, num_passages, p_seq_length = p_input_ids.size() - q_seq_length = q_input_ids.size()[-1] - - scores = self( - q_input_ids=q_input_ids.view(-1, q_seq_length), - q_token_type_ids=q_input_type_ids.view(-1, q_seq_length), - q_attention_mask=q_input_mask.view(-1, q_seq_length), - p_input_ids=p_input_ids.view(-1, p_seq_length), - p_token_type_ids=p_input_type_ids.view(-1, p_seq_length), - p_attention_mask=p_input_mask.view(-1, p_seq_length), - ).view(batch_size, 1, batch_size + num_passages - 1) - normalized_scores = torch.log_softmax(scores, dim=-1) - - labels = torch.arange(batch_size)[:, None].long().to(normalized_scores.device) - loss = self.loss( - log_probs=normalized_scores, - labels=labels, - output_mask=torch.ones_like(labels), - ) - - scores = scores[:, 0] - scores = torch.cat( - (torch.diag(scores)[:, None], scores[:, batch_size:]), - dim=1, - ) - - return scores, loss - - def _setup_dataloader_from_config(self, cfg: DictConfig): - - dataset = BertInformationRetrievalDataset( - tokenizer=self.tokenizer, - passages=cfg.passages, - queries=cfg.queries, - query_to_passages=cfg.query_to_passages, - num_negatives=cfg.num_negatives, - psg_cache_format=cfg.get("psg_cache_format", "pkl"), - max_query_length=cfg.get("max_query_length", 31), - max_passage_length=cfg.get("max_passage_length", 190), - preprocess_fn="preprocess_dpr", - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", 
False), - drop_last=cfg.get("drop_last", False), - ) diff --git a/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py deleted file mode 100644 index 2f0445d4c184..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings - -import torch -import torch.nn.functional as F -from torch import Tensor, nn - -from nemo.collections.nlp.models.language_modeling.megatron.bert.bert_model import ( - MCoreBertModelWrapperWithPostLNSupport, - NeMoBertModel, -) -from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - -try: - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - ModelParallelConfig = ApexGuardDefaults - - HAVE_MEGATRON_CORE = False - - -class BertEmbeddingHead(nn.Module): - """Performs mean pooling on the token embeddings.""" - - def __init__( - self, - word_embedding_dimension: int, - pooling_mode_mean_tokens: bool = True, - ): - super(BertEmbeddingHead, self).__init__() - - self.config_keys = [ - "word_embedding_dimension", - "pooling_mode_mean_tokens", - ] - self.word_embedding_dimension = word_embedding_dimension - self.pooling_mode_mean_tokens = pooling_mode_mean_tokens - - def forward(self, token_embeddings: Tensor, attention_mask: Tensor): - # pylint: disable=C0116 - token_embeddings = token_embeddings.permute(1, 0, 2) - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - - sum_mask = input_mask_expanded.sum(1) - - sum_mask = torch.clamp(sum_mask, min=1e-9) - - output_vector = sum_embeddings / sum_mask - - output_vector = F.normalize(output_vector, p=2, dim=1) - - return output_vector - - def __repr__(self): - return "Pooling({}) and Normalize".format(self.get_config_dict()) - - def get_config_dict(self): - # pylint: disable=C0116 - return {key: self.__dict__[key] for key in self.config_keys} - - -class MCoreBertEmbeddingModel(MCoreBertModelWrapperWithPostLNSupport): - """BertEmbeddingModel that wraps a BertEmbeddingHead and MCoreBertEmbeddingModel""" - - def __init__(self, *args, **kwargs): - - super(MCoreBertEmbeddingModel, self).__init__(*args, **kwargs) - # Changing the default settings of the original Bert model to make it compatible with the embedding model. 
- self.post_process = False - self.binary_head = None - self.lm_head = None - self.output_layer = None - self.encoder.final_layernorm = None - self.encoder.post_process = False - self.embedding_head = BertEmbeddingHead( - word_embedding_dimension=self.config.hidden_size, - pooling_mode_mean_tokens=True, - ) - - def forward( - self, - input_ids: Tensor, - attention_mask: Tensor, - tokentype_ids: Tensor = None, - lm_labels: Tensor = None, - inference_params=None, - **kwargs, - ): - """Forward function of BERT model - - Forward function of the BERT Model This function passes the input tensors - through the embedding layer, and then the encoder and finally into the post - processing layer (optional). - - It either returns the Loss values if labels are given or the final hidden units - """ - hidden_states = super(MCoreBertEmbeddingModel, self).forward( - input_ids, attention_mask, tokentype_ids, lm_labels, inference_params - ) - embeddings_out = self.embedding_head(hidden_states, attention_mask) - return embeddings_out - - -class NeMoBertEmbeddingModel(NeMoBertModel): - """ - Bert Language model. - Model returns [seq, batch, hidden] shape - """ - - def __init__(self, *args, **kwargs): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. Use MCoreBertEmbeddingModel instead.", DeprecationWarning - ) - super().__init__(*args, **kwargs) - self.embedding_head = BertEmbeddingHead( - word_embedding_dimension=self.config.hidden_size, - pooling_mode_mean_tokens=True, - ) - - def forward( - self, - bert_model_input, - attention_mask, - token_type_ids=None, - lm_labels=None, - checkpoint_activations_all_layers=None, - ): - - lm_output = super(NeMoBertEmbeddingModel, self).forward( - bert_model_input, attention_mask, token_type_ids, lm_labels, checkpoint_activations_all_layers - ) - embeddings_out = self.embedding_head(lm_output[0], attention_mask) - return embeddings_out diff --git a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py b/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py deleted file mode 100644 index 33885e6b50c6..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
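For reference, the masked mean pooling and L2 normalization performed by `BertEmbeddingHead` above can be reproduced in isolation as follows; this is an illustrative sketch with made-up tensor sizes, not code from the deleted files:

```python
# Masked mean pooling over tokens followed by L2 normalization, mirroring BertEmbeddingHead.forward.
import torch
import torch.nn.functional as F

seq_len, batch_size, hidden = 8, 2, 16                       # made-up sizes
token_embeddings = torch.randn(seq_len, batch_size, hidden)  # encoder output in [seq, batch, hidden]
attention_mask = torch.ones(batch_size, seq_len)
attention_mask[1, 5:] = 0                                    # pretend the second sample is padded

token_embeddings = token_embeddings.permute(1, 0, 2)         # -> [batch, seq, hidden]
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

summed = (token_embeddings * mask).sum(dim=1)                # sum of non-padded token vectors
counts = mask.sum(dim=1).clamp(min=1e-9)                     # number of non-padded tokens
embeddings = F.normalize(summed / counts, p=2, dim=1)        # unit-length sentence embeddings

print(embeddings.shape)        # torch.Size([2, 16])
print(embeddings.norm(dim=1))  # ~1.0 for each row
```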
- -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import SmoothedCrossEntropyLoss -from nemo.collections.nlp.models.information_retrieval.base_ir_model import BaseIRModel -from nemo.collections.nlp.modules.common import SequenceRegression -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import NeuralType - -__all__ = ["BertJointIRModel"] - - -class BertJointIRModel(BaseIRModel): - """ - Information retrieval model which jointly encodes both query and passage - and passes them to BERT encoder followed by a fully-connected layer for - similarity score prediction. - """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return self.bert_model.input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return self.sim_score_regressor.output_types - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - model_name = cfg.language_model.pretrained_model_name - self.tokenizer = get_tokenizer(tokenizer_name=model_name) - - super().__init__(cfg=cfg, trainer=trainer) - - self.bert_model = self.get_lm_model_with_padded_embedding(cfg) - hidden_size = self.bert_model.config.hidden_size - self.sim_score_regressor = SequenceRegression( - hidden_size=hidden_size, - num_layers=1, - dropout=cfg.language_model.sim_score_dropout, - ) - self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id) - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - - hidden_states = self.bert_model( - input_ids=input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - scores = self.sim_score_regressor(hidden_states=hidden_states) - - return scores - - def compute_scores_and_loss(self, inputs): - input_ids, input_mask, input_type_ids = inputs - batch_size, num_passages, seq_length = input_ids.size() - - unnormalized_scores = self( - input_ids=input_ids.view(-1, seq_length), - attention_mask=input_mask.view(-1, seq_length), - token_type_ids=input_type_ids.view(-1, seq_length), - ).view(batch_size, 1, num_passages) - scores = torch.log_softmax(unnormalized_scores, dim=-1) - - labels = torch.zeros_like(input_ids[:, :1, 0]) - loss = self.loss(log_probs=scores, labels=labels, output_mask=torch.ones_like(labels)) - - return unnormalized_scores[:, 0], loss diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py deleted file mode 100644 index 4f47c3a67216..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ /dev/null @@ -1,724 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
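`BertJointIRModel` above trains with a listwise objective: each query's candidate passages are scored jointly with the query, a softmax is taken over the candidates, and the relevant passage is assumed to sit at index 0. A minimal sketch of that objective, with random scores standing in for the `sim_score_regressor` output and plain `torch.nn.functional.cross_entropy` substituted for NeMo's `SmoothedCrossEntropyLoss`:

```python
# Listwise ranking objective: maximize the softmax probability of candidate 0 (the relevant passage).
import torch
import torch.nn.functional as F

batch_size, num_passages = 4, 8                              # made-up sizes
unnormalized_scores = torch.randn(batch_size, num_passages)  # stand-in for the regressor scores

labels = torch.zeros(batch_size, dtype=torch.long)           # positive passage is candidate 0 by convention
loss = F.cross_entropy(unnormalized_scores, labels)          # == -log softmax(scores)[:, 0], averaged over queries
print(loss)
```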
-# -# flake8: noqa -# pylint: skip-file - -import logging -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec -from omegaconf import DictConfig, OmegaConf, open_dict -from omegaconf.dictconfig import DictConfig -from torch.distributed import all_gather as all_gather_no_backprop -from torch.distributed.nn.functional import all_gather as all_gather_with_backprop - -from nemo.collections.nlp.data.information_retrieval.bert_embedding_dataset import BertEmbeddingDataset -from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( - MegatronPretrainingRandomSampler, - MegatronPretrainingSampler, -) -from nemo.collections.nlp.models.information_retrieval.bert_embedding_model import ( - MCoreBertEmbeddingModel, - NeMoBertEmbeddingModel, -) -from nemo.collections.nlp.models.language_modeling.megatron.bert.bert_spec import ( - bert_layer_with_transformer_engine_spec_postln, -) -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.modules.common.megatron.utils import ( - ApexGuardDefaults, - average_losses_across_data_parallel_group, -) -from nemo.collections.nlp.parts.utils_funcs import get_last_rank -from nemo.utils import logging - -try: - from megatron.core import parallel_state - from megatron.core.pipeline_parallel.schedules import get_forward_backward_func - from megatron.core.transformer.module import Float16Module as MCoreFloat16Module - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - TransformerConfig = ApexGuardDefaults - ModelParallelConfig = ApexGuardDefaults - - HAVE_MEGATRON_CORE = False - -try: - from megatron.core.num_microbatches_calculator import get_num_microbatches - -except (ImportError, ModuleNotFoundError): - logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") - from apex.transformer.pipeline_parallel.utils import get_num_microbatches - - -def listify(tensor): - l_tensor = [] - for t in tensor: - r = t[:].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -class MegatronBertEmbeddingModel(MegatronBertModel): - """ - Megatron Bert pretraining. 
- Model returns [batch, seq, hidden] shape - """ - - def __init__(self, cfg: DictConfig, trainer: Trainer): - - super().__init__(cfg, trainer=trainer) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss(label_smoothing=cfg.get('label_smoothing', 0.0)) - softmax_temp = cfg.get('softmax_temp', 0.05) - self.scale = 1.0 / softmax_temp - self.hard_negatives_to_train = self.cfg.data.get("hard_negatives_to_train", 4) - self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) - self.backprop_type = self.cfg.get("backprop_type", "local") - assert self.backprop_type in ["local", "global"], "Backprop type must be `local` or `global`" - - def model_provider_func(self, pre_process, post_process): - cfg = self.cfg - num_tokentypes = 2 if cfg.bert_binary_head else 0 - transformer_block_type = cfg.get('transformer_block_type', 'post_ln') - if self.mcore_bert: - if transformer_block_type == 'pre_ln': - layer_spec = bert_layer_with_transformer_engine_spec - else: - layer_spec = bert_layer_with_transformer_engine_spec_postln - model = MCoreBertEmbeddingModel( - config=self.transformer_config, - transformer_layer_spec=layer_spec, - vocab_size=self.padded_vocab_size, - max_sequence_length=cfg.max_position_embeddings, - num_tokentypes=num_tokentypes, - add_binary_head=cfg.bert_binary_head, - share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - transformer_block_type=transformer_block_type, - add_pooler=self.cfg.get('add_pooler', True), - ) - - else: - model = NeMoBertEmbeddingModel( - config=self.model_parallel_config, - vocab_size=self.padded_vocab_size, - hidden_size=cfg.hidden_size, - max_position_embeddings=cfg.max_position_embeddings, - num_layers=cfg.num_layers, - num_attention_heads=cfg.num_attention_heads, - apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True), - kv_channels=cfg.get('kv_channels', None), - ffn_hidden_size=cfg.ffn_hidden_size, - num_tokentypes=num_tokentypes, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - init_method_std=cfg.get('init_method_std', 0.02), - fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False), - hidden_dropout=cfg.get('hidden_dropout', 0.1), - precision=cfg.get('precision', 16), - fp32_residual_connection=cfg.get('fp32_residual_connection', False), - activations_checkpoint_granularity=self.cfg.get('activations_checkpoint_granularity', None), - activations_checkpoint_method=self.cfg.get('activations_checkpoint_method', None), - activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1), - activations_checkpoint_layers_per_pipeline=self.cfg.get( - 'activations_checkpoint_layers_per_pipeline', None - ), - layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5), - masked_softmax_fusion=cfg.get('masked_softmax_fusion', True), - normalization=cfg.get('normalization', 'layernorm'), - transformer_block_type=transformer_block_type, - bias_gelu_fusion=cfg.get('bias_gelu_fusion', True), - bias_dropout_add_fusion=cfg.get("bias_dropout_add_fusion", True), - onnx_safe=cfg.get('onnx_safe', False), - add_binary_head=cfg.bert_binary_head, - megatron_legacy=cfg.get('megatron_legacy', False), - position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"), - add_pooler=cfg.get('add_pooler', True), - add_lm_head=cfg.get('add_lm_head', False), - ) - - return model - - def build_train_valid_test_datasets(self, is_train=True): - - 
self._train_ds = None - self._validation_ds = None - self._test_ds = None - - if is_train: - self._train_ds = BertEmbeddingDataset( - self.cfg.data.data_train, - tokenizer=self.tokenizer, - add_bos=True, - num_hard_negatives=self.cfg.data.get("hard_negatives_to_train", 4), - max_seq_length=self.cfg.encoder_seq_length, - ) - if self.cfg.data.data_validation: - self._validation_ds = BertEmbeddingDataset( - self.cfg.data.data_validation, - tokenizer=self.tokenizer, - add_bos=True, - num_hard_negatives=self.cfg.data.get("hard_negatives_to_train", 4), - max_seq_length=self.cfg.encoder_seq_length, - ) - - else: - logging.info(f'Building test dataset') - if self.cfg.data.data_test.query_file_names is None or self.cfg.data.data_test.doc_file_names is None: - return [] - - query_dataset = BertEmbeddingDataset( - file_path=self.cfg.data.data_test.query_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=self.cfg.encoder_seq_length, - add_bos=True, - add_eos=True, - data_type="query", - ) - doc_dataset = BertEmbeddingDataset( - file_path=self.cfg.data.data_test.doc_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=self.cfg.encoder_seq_length, - add_bos=True, - add_eos=True, - data_type="doc", - ) - - self._test_ds = [query_dataset, doc_dataset] - - if self._train_ds is not None: - logging.info(f'Length of train dataset: {len(self._train_ds)}') - if self._validation_ds is not None: - logging.info(f'Length of val dataset: {len(self._validation_ds)}') - if self._test_ds is not None: - logging.info(f'Length of test query dataset: {len(self._test_ds[0])}') - logging.info(f'Length of test doc dataset: {len(self._test_ds[1])}') - - logging.info(f'Finished building SBert datasets.') - - return self._train_ds, self._validation_ds, self._test_ds - - def setup(self, stage=None): - """ - PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. - - Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. - """ - - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() - - logging.info( - f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' - f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' - f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' - f'Total number of model parameters: {total_num_parameters:.2e}.' - ) - - resume_checkpoint_path = self.trainer.ckpt_path - if resume_checkpoint_path: - init_consumed_samples = self._extract_consumed_samples_from_ckpt(resume_checkpoint_path) - else: - init_consumed_samples = 0 - self.init_consumed_samples = init_consumed_samples - self.init_global_step = self.trainer.global_step - - if stage == 'predict': - return - elif stage == 'test': - self.build_train_valid_test_datasets(is_train=False) - self.setup_test_data(self.cfg.data) - else: - # TODO: consider adding a ModelPT guard to check if model is being restored. 
- # allowing restored models to optionally setup datasets - if self.cfg.data.dataloader_type == "LDDL": - self.build_LDDL_data(self.cfg.data) - torch.distributed.barrier() - else: - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - - # when using pipeline model parallel the final stage need to initialize word embeddings - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if isinstance(self.model, list): - for i, module in enumerate(self.model): - sync_embeddings = ( - module.setup_embeddings_and_output_layer - if self.mcore_bert - else module.sync_initial_word_embeddings - ) - sync_embeddings() - else: - sync_embeddings = ( - self.model.setup_embeddings_and_output_layer - if self.mcore_bert - else self.model.sync_initial_word_embeddings - ) - sync_embeddings() - - if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_bert', False): - self.setup_transformer_engine_tp_groups() - - @classmethod - def merge_cfg_with(cls, path, cfg): - """ - Merge a given configuration dictionary `cfg` with the configuration dictionary - obtained from restoring a MegatronBertModel at the specified `path`. - - Args: - path (str): The path to the Bert model checkpoint to be restored. - cfg (DictConfig): The configuration dictionary to merge. - - Returns: - DictConfig: The merged configuration dictionary. - - Examples: - >>> path = "/path/to/model/checkpoint" - >>> cfg = DictConfig({"model": {"key": "value"}, "trainer": {"precision": 16}}) - >>> merged_cfg = merge_cfg_with(path, cfg) - - Notes: - - The function resolves variables within the `cfg` dictionary using `OmegaConf.resolve`. - - Keys in `cfg.model` will override the corresponding keys in the output dictionary. - - If "train_ds" exists in `cfg.model.data`, it updates `micro_batch_size` and `global_batch_size`. - - If `cfg.trainer` contains a "precision" key, it updates `output.precision`. 
- - """ - - base_cfg = cls.restore_from(path, return_config=True) - - OmegaConf.resolve(cfg) - with open_dict(base_cfg): - for key, val in cfg.model.items(): - base_cfg[key] = val - if "train_ds" in cfg.model.data: - base_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - base_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - if cfg.get("trainer", None) and cfg.trainer.get("precision"): - base_cfg.precision = cfg.trainer.precision - - return base_cfg - - def build_pretraining_data_loader(self, dataset, consumed_samples): - """Buld dataloader given an input dataset.""" - - if dataset is None: - return None - - # Megatron sampler - if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None: - if self.cfg.data.dataloader_type == 'single': - batch_sampler = MegatronPretrainingSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, - global_batch_size=self.cfg.global_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), - drop_last=self.cfg.get('drop_last', False), - pad_samples_to_global_batch_size=not self.cfg.get('drop_last', False), - ) - elif self.cfg.data.dataloader_type == 'cyclic': - batch_sampler = MegatronPretrainingRandomSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), - drop_last=self.cfg.get('drop_last', False), - pad_samples_to_global_batch_size=not self.cfg.get('drop_last', False), - ) - else: - raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"') - else: - raise ValueError('cfg.data.dataloader_type not found. Must be "single" or "cyclic"') - - # Torch dataloader. 
- - dataloader = torch.utils.data.DataLoader( - dataset, - shuffle=False, - batch_sampler=batch_sampler, - num_workers=self.cfg.data.num_workers, - pin_memory=True, - persistent_workers=True if self.cfg.data.num_workers > 0 else False, - collate_fn=dataset.collate_fn, - ) - return dataloader - - def setup_training_data(self, cfg): - if self._train_ds: - consumed_samples = self.compute_consumed_samples(0) - logging.info( - f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' - ) - self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) - - def setup_validation_data(self, cfg): - if self._validation_ds: - consumed_samples = 0 - logging.info( - f'Setting up validation dataloader with len(len(self._validation_ds)): {len(self._validation_ds)} and consumed samples: {consumed_samples}' - ) - self._validation_dl = self.build_pretraining_data_loader(self._validation_ds, consumed_samples) - - def setup_eval_dataloader(self, datasets): - dataloaders = [] - for dataset in datasets: - eval_dl = self.build_pretraining_data_loader( - dataset=dataset, - consumed_samples=0, - ) - dataloaders.append(eval_dl) - return dataloaders - - def setup_test_data(self, cfg): - if self._test_ds: - logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds[0])}, {len(self._test_ds[1])}' - ) - self._test_dl = self.setup_eval_dataloader(self._test_ds) - return - - def training_step(self, dataloader_iter): - - self._optimizer.zero_grad() - - if self.with_distributed_adam: - # hack to enable overlapping param sync and forward compute - # note: the distributed optimizer monkey-patches each - # parameter's __getattribute__ function so that it can - # launch parameter all-gathers the first time the - # parameter is accessed after the optimizer step. However, - # PyTorch directly passes embedding parameters into a C++, - # bypassing this process. A quick-and-dirty hack is to - # manually interact with the parameter. 
- modules = self.model if isinstance(self.model, list) else [self.model] - for module in modules: - if isinstance(module, (Float16Module, MCoreFloat16Module)): - module = module.module - if not self.mcore_bert: - module = module.language_model - if hasattr(module, 'embedding'): - for param in module.embedding.parameters(): - param.data_ptr() - - if self.cfg.data.dataloader_type == "LDDL": - # this is of type bert dataset - seq_length = dataloader_iter.iterator.loaders.get_seqlen() - else: - seq_length = self.cfg.encoder_seq_length - - # run forward and backwards passes for an entire global batch - # we do this inside training_step to support pipeline parallelism - fwd_bwd_function = get_forward_backward_func() - - losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=self._make_data_iterator_list(dataloader_iter), - model=self.model, - num_microbatches=get_num_microbatches(), - forward_only=False, - seq_length=seq_length, - micro_batch_size=self.cfg.micro_batch_size, - ) - - if losses_reduced_per_micro_batch: - loss_tensors_list = [loss_reduced['loss'] for loss_reduced in losses_reduced_per_micro_batch] - loss_tensor = torch.vstack(loss_tensors_list) - loss_mean = loss_tensor.mean(axis=0) - else: - if self.cfg.bert_binary_head == True: - loss_mean = torch.tensor([0.0, 0.0, 0.0]).cuda() - else: - loss_mean = torch.tensor([0.0, 0.0]).cuda() - - # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced - if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): - self.allreduce_sequence_parallel_gradients() - - if self.with_distributed_adam: - # synchronize asynchronous grad reductions - # note: not necessary, but reduces performance degradation - # from multiple simultaneous NCCL calls - self._optimizer._finish_bucket_grad_sync() - elif self.megatron_amp_O2: - if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False): - # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - self._optimizer.allreduce_main_grads() - else: - # async grad allreduce is not currently implemented for O1/autocasting mixed precision training - # so we all-reduce gradients after the pipeline - self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) - - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - # when using pipeline parallelism the first and last stage must keep embeddings in sync - self.allreduce_first_last_embeddings() - - torch.distributed.broadcast(loss_mean, get_last_rank()) - - if self.torch_dtype == torch.float16: - loss_scale = self.trainer.precision_plugin.scaler._scale - if loss_scale is not None: - self.log('loss_scale', loss_scale, batch_size=1) - - self.log('reduced_train_loss', loss_mean[0], prog_bar=True, batch_size=1) - if len(loss_mean) > 2: - self.log('reduced_lm_train_loss', loss_mean[1], prog_bar=True, batch_size=1) - self.log('reduced_sop_train_loss', loss_mean[2], prog_bar=True, batch_size=1) - lr = self._optimizer.param_groups[0]['lr'] - self.log('lr', lr, batch_size=1) - self.log('global_step', self.trainer.global_step, prog_bar=True, batch_size=1) - self.log( - 'consumed_samples', - self._compute_consumed_samples_after_training_step(), - prog_bar=True, - batch_size=1, - ) - return loss_mean[0] - - def get_forward_output_and_loss_func(self): - def fwd_output_and_loss_func(dataloader_iter, model, 
checkpoint_activations_all_layers=None): - - batches, _, dl_idx = next(dataloader_iter) - metadata = batches.pop('metadata') - batches = {k: v.cuda(non_blocking=True) for k, v in batches.items()} - - if self.mcore_bert: - - batches["tokentype_ids"] = batches.pop("token_type_ids") - output_tensor = model(**batches) - else: - output_tensor = self.forward(**batches).permute(1, 0) - - def loss_func(output_tensor): - - loss_dict = self.loss_func(output_tensor) - - if 'sop loss' in loss_dict: - lm_loss = loss_dict['lm loss'] - sop_loss = loss_dict['sop loss'] - loss = lm_loss + sop_loss - reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss, sop_loss]) - else: - lm_loss = loss_dict['lm loss'] - loss = lm_loss - reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss]) - - if 'hs' in loss_dict: - # metadata = batches.get('metadata', [{}] * len(batches['input_ids'])) - return loss, { - 'loss': reduced_loss, - 'd_hs': loss_dict['hs'], - 'q_hs': loss_dict['hs'], - 'metadata': metadata, - 'dl_idx': dl_idx, - } - else: - return loss, {'loss': reduced_loss} - - return output_tensor, loss_func - - return fwd_output_and_loss_func - - def validation_step(self, dataloader_iter): - prefix = "test" if self.trainer.testing else "val" - if self.cfg.data.dataloader_type == "LDDL": - seq_length = dataloader_iter.iterator.get_seqlen() - else: - seq_length = self.cfg.encoder_seq_length - - fwd_bwd_function = get_forward_backward_func() - - losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=self._make_data_iterator_list(dataloader_iter), - model=self.model, - num_microbatches=get_num_microbatches(), - forward_only=True, - seq_length=seq_length, - micro_batch_size=self.cfg.micro_batch_size, - ) - - if losses_reduced_per_micro_batch: - loss_tensors_list = [loss_reduced['loss'] for loss_reduced in losses_reduced_per_micro_batch] - loss_tensor = torch.vstack(loss_tensors_list) - loss_mean = loss_tensor.mean(axis=0) - else: - loss_mean = torch.tensor([0.0]).cuda() - - loss = loss_mean[0] - if prefix == 'val': - self.validation_step_outputs.append(loss) - else: - assert len(losses_reduced_per_micro_batch) == 1 - dataloader_idx = losses_reduced_per_micro_batch[0]['dl_idx'] - self.test_step_outputs[dataloader_idx].append(losses_reduced_per_micro_batch[0]) - return loss - - def on_test_epoch_end(self): - for dataloader_idx, output in enumerate(self.test_step_outputs): - self.gather_and_maybe_write_predictions(output, self.cfg.data.data_test, 'test', dataloader_idx) - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'q_hs': batch['q_hs'], - 'd_hs': batch['d_hs'], - 'metadata': batch['metadata'], - } - for batch in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. 
- deduplicated_outputs = { - 'q_hs': [], - 'd_hs': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['q_hs']) - l_d_hs = listify(batch['d_hs']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) == len(l_d_hs) - for q_hs, d_hs, metadata in zip( - l_q_hs, - l_d_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['q_hs'].append(q_hs) - deduplicated_outputs['d_hs'].append(d_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - filename_log_key = f"{mode}_{data_cfg.names[dataloader_idx]}" - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - emb_type = 'query' if d_idx == 0 else 'doc' - hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/{emb_type}.ids", "w") as f: - for m in outputs['metadata']: - f.write(m[f"{emb_type}_id"] + "\n") - np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) - return True - - def inference_loss_func(self, eos_tensors): - hs = eos_tensors - _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] - return { - 'hs': eos_tensors, - 'lm loss': _blank, - } - - def _gather_global_inbatch_representations(self, local_tensor): - local_tensor = local_tensor.contiguous() - if self.backprop_type == 'local': - global_tensors = [ - torch.zeros_like(local_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) - ] - all_gather_no_backprop(global_tensors, local_tensor, group=parallel_state.get_data_parallel_group()) - global_tensors[parallel_state.get_data_parallel_rank()] = local_tensor - global_tensors = torch.cat(global_tensors, dim=0) - - else: - global_tensors = all_gather_with_backprop(local_tensor) - global_tensors = torch.cat(global_tensors, dim=0) - - return global_tensors - - def loss_func(self, output_tensor): - if self.global_inbatch_negatives and self.trainer.training: - output_tensor = self._gather_global_inbatch_representations(output_tensor) - if self.trainer.testing: - return self.inference_loss_func(output_tensor) - - num_tensors_per_example = 2 + self.hard_negatives_to_train - bs = output_tensor.shape[0] // num_tensors_per_example - chunks = output_tensor.chunk(bs) - queries = torch.stack([item[0] for item in chunks]) # shape (bs, embedding_dim) - positives = 
torch.stack([item[1] for item in chunks]) # shape (bs, embedding_dim) - - pos_inbatch_negs_scores = torch.mm( - queries, positives.transpose(0, 1) - ) # shape (bs, bs); each positive is negative for other queries. - - hard_negs = [ - torch.stack([item[i + 2] for item in chunks]) for i in range(self.hard_negatives_to_train) - ] # List of length "num_negatives", each tensor of shape (bs, embedding_dim) - - hard_negs_scores = ( - torch.multiply( - queries.unsqueeze(0).repeat(len(hard_negs), 1, 1), - torch.stack(hard_negs), - ) - .sum(axis=-1) - .T - ) # shape = (bs, num_negatives); Hard negatives are not shared between queries. - - scores = torch.cat([pos_inbatch_negs_scores, hard_negs_scores], axis=1) - - scores = scores.clamp(-1.0, 1.0) - scores *= self.scale - - labels = torch.tensor( - range(len(scores)), dtype=torch.long, device=scores.device - ) # Indices of the (query, positive) pairs - - return {'lm loss': self.cross_entropy_loss(scores, labels)} diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py deleted file mode 100644 index b5240ec2e170..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ /dev/null @@ -1,475 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
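The embedding `loss_func` removed above scores each query against every in-batch positive (so other examples' positives act as negatives) and against its own hard negatives, then applies a scaled cross-entropy with the correct pairs on the diagonal. A minimal standalone sketch of that scoring, assuming pre-normalized embeddings; the function name, the default `scale`, and the random inputs in the usage lines are illustrative, not taken from the deleted code:

import torch
import torch.nn.functional as F

def contrastive_loss(queries, positives, hard_negs, scale=20.0):
    # queries, positives: (bs, dim); hard_negs: list of (bs, dim) tensors, one per hard negative.
    pos_inbatch_scores = queries @ positives.T  # (bs, bs); off-diagonal entries act as in-batch negatives
    if hard_negs:
        # Hard negatives are scored only against their own query, not shared across the batch.
        hard_neg_scores = torch.stack([(queries * hn).sum(dim=-1) for hn in hard_negs], dim=1)  # (bs, n_hard)
        scores = torch.cat([pos_inbatch_scores, hard_neg_scores], dim=1)
    else:
        scores = pos_inbatch_scores
    scores = scores.clamp(-1.0, 1.0) * scale  # clamp assumes cosine-like (normalized) inputs
    labels = torch.arange(queries.shape[0], device=queries.device)  # index of each (query, positive) pair
    return F.cross_entropy(scores, labels)

# Illustrative usage with random, normalized embeddings (bs=4, dim=8, 2 hard negatives per query):
q = F.normalize(torch.randn(4, 8), dim=1)
p = F.normalize(torch.randn(4, 8), dim=1)
negs = [F.normalize(torch.randn(4, 8), dim=1) for _ in range(2)]
loss = contrastive_loss(q, p, negs)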
- -import itertools -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import DictConfig, ListConfig - -from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTEmbeddingDataset -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -def listify(tensor): - l_tensor = [] - for t in tensor: - for rid in range(t.shape[0]): - r = t[rid, :].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -def _gather_global_inbatch_representations(local_eos_tensor): - local_eos_tensor = local_eos_tensor.contiguous() - global_eos_tensors = [ - torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) - ] - torch.distributed.all_gather(global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group()) - global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor - global_eos_tensors = torch.cat(global_eos_tensors, dim=0) - return global_eos_tensors - - -class MegatronGPTEmbeddingModel(MegatronGPTSFTModel): - def __init__(self, cfg: DictConfig, trainer: Trainer): - super().__init__(cfg, trainer=trainer) - self.temperature = self.cfg.get('temperature', 0.02) - self.use_all_possible_negatives = self.cfg.get("use_all_possible_negatives", True) - self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) - if self.cfg.get("do_mrl", False): - min_mrl = self.cfg.get("min_mrl_dim", int(np.log2(32))) - 1 - max_mrl = int(np.log2(self.cfg.hidden_size // 2)) - self.mrl_dims = [2**i for i in range(max_mrl, min_mrl, -1)] - else: - self.mrl_dims = [] - - assert ( - self.cfg.get("post_process", False) is False - ), "post_process must be False to get hidden states in the loss_func" - - def model_provider_func(self, pre_process, post_process): - # (@adithyare) We need post_process to be False to get hidden states in the loss_func - return super().model_provider_func(pre_process, post_process=False) - - def maybe_setup_test(self): - if ( - hasattr(self.cfg.data, 'test_ds') - and self.cfg.data.test_ds.get('doc_file_names', None) is not None - and self.cfg.data.test_ds.get('query_file_names', None) is not None - ): - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - return - - def maybe_build_test(self): - if ( - hasattr(self.cfg.data, 'test_ds') - and self.cfg.data.test_ds.get('doc_file_names', None) is not None - and self.cfg.data.test_ds.get('query_file_names', None) is not None - ): - logging.info('Building GPT Embedder test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. - self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - - def _build_dataset(self, data_cfg, is_train=True): - packed_sequence = data_cfg.get("packed_sequence", False) - - # Determine if we are using a single dataset or a list of datasets. 
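As a worked example of the `mrl_dims` schedule computed in `__init__` above (the Matryoshka prefix sizes over which `loss_func` later in this file adds extra contrastive terms on truncated embeddings), here is the arithmetic for an assumed `hidden_size` of 4096 with the default `min_mrl_dim`; the concrete hidden size is an assumption for illustration only:

import numpy as np

hidden_size = 4096                           # assumption for illustration only
min_mrl = int(np.log2(32)) - 1               # 5 - 1 = 4
max_mrl = int(np.log2(hidden_size // 2))     # log2(2048) = 11
mrl_dims = [2**i for i in range(max_mrl, min_mrl, -1)]
print(mrl_dims)                              # [2048, 1024, 512, 256, 128, 64, 32]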
- if is_train: - # Construct the data prefix list for `get_datasets_weights_and_num_samples()` - # that is of the format [weight1,file_name1,weight2,file_name2,...] - if data_cfg.concat_sampling_probabilities is None or not isinstance( - data_cfg.concat_sampling_probabilities, ListConfig - ): - raise ValueError( - ( - f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." - f"Found: {data_cfg.concat_sampling_probabilities}" - ) - ) - - if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): - raise ValueError( - ( - f"concat_sampling_probabilities must be of the same size as file_names.", - f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", - ) - ) - - data_prefix = [] - for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): - data_prefix.append(weight) - data_prefix.append(prefix) - - if self.trainer.max_steps is None or self.trainer.max_steps <= 0: - raise ValueError( - f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' - ) - num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] - _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) - num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) - else: - num_query_files = len(data_cfg.query_file_names) if data_cfg.query_file_names is not None else 0 - num_doc_files = len(data_cfg.doc_file_names) if data_cfg.doc_file_names is not None else 0 - num_query_samples_per_dataset = [[None]] * num_query_files - num_doc_samples_per_dataset = [[None]] * num_doc_files - - # Check dataset max_seq_legnth and max_position_embeddings size - if ( - self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] - and data_cfg.max_seq_length > self.cfg.max_position_embeddings - ): - logging.warning( - f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" - ) - data_cfg.max_seq_length = self.cfg.max_position_embeddings - - # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 - # When using sequence parallel, sequence will further be split by TP size - pad_seq_length_to_mult = ( - 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 - ) - if is_train: - datasets = [] - for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - dataset = GPTEmbeddingDataset( - file_path=file_path, - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=num_samples[0], - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - ) - datasets.append(dataset) - if packed_sequence: - raise NotImplementedError("Packed sequence is not supported for MegatronGPTEmbeddingModel") - - dataset = BlendableDataset( - datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend - ) - return dataset - else: - if data_cfg.query_file_names is None or data_cfg.doc_file_names is None: - return [] - - query_dataset = GPTEmbeddingDataset( - file_path=data_cfg.query_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=None, - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="query", - ) - doc_dataset = GPTEmbeddingDataset( - file_path=data_cfg.doc_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=None, - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="doc", - ) - return [query_dataset, doc_dataset] - - def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): - loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) - avg_pos_cs = non_loss_tensors['avg_pos_cs'][0].item() - avg_neg_cs = non_loss_tensors['avg_neg_cs'][0].item() - diff_cs = non_loss_tensors['diff_cs'][0].item() - self.log("avg_pos_cs", avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - self.log("avg_neg_cs", avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - self.log("diff_cs", diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - return loss_mean - - def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): - metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) - outputs = { - 'loss': loss, - 'metadata': metadata, # [dict] - 'q_hs': non_loss_tensors['query_hs'], # [batch_size, hidden_size] - 'd_hs': non_loss_tensors['doc_hs'], # [batch_size, hidden_size] - } - return outputs - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, averaged_metric, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'q_hs': batch['q_hs'], - 'd_hs': batch['d_hs'], - 'metadata': batch['metadata'], - } - for batch in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'q_hs': [], - 'd_hs': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['q_hs']) - l_d_hs = listify(batch['d_hs']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) == len(l_d_hs) - for q_hs, d_hs, metadata in zip( - l_q_hs, - l_d_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['q_hs'].append(q_hs) - deduplicated_outputs['d_hs'].append(d_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - assert metric_name == "loss", "Only loss is supported for now." 
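The `write_embeddings_to_file` method below pairs a `.npy` matrix with a `.ids` file (one id per row) under `{output_file_path_prefix}/consumed_samples{N}/{log_key}/`, with dataloader 0 holding query embeddings and dataloader 1 holding document embeddings. A hedged sketch of reading those files back; the folder argument and helper name are assumptions, not part of the deleted code:

import numpy as np

def load_embeddings(folder, emb_type="query"):
    # emb_type is "query" for dataloader 0 and "doc" for dataloader 1.
    embs = np.load(f"{folder}/{emb_type}.npy")      # shape: (num_examples, hidden_size)
    with open(f"{folder}/{emb_type}.ids") as f:
        ids = [line.strip() for line in f]          # i-th id labels the i-th row of embs
    assert len(ids) == embs.shape[0]
    return ids, embs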
- # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() - # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() - # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() - # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - # (@adithyare) We are not using the log key to write the embeddings to file - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - emb_type = 'query' if d_idx == 0 else 'doc' - hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/{emb_type}.ids", "w") as f: - for m in outputs['metadata']: - f.write(m[f"{emb_type}_id"] + "\n") - np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) - return True - - def local_validation_step(self, dataloader_iter): - """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. - """ - # Check if iterator is exhausted - # dataloader_iter, done = self._val_iterator_done(dataloader_iter) - # if done: - # return - # Get the dataloader_idx when MegatronGPTSFTModel calls validation_step of MegatronGPTModel - next_item_dataloader = next(dataloader_iter) - if isinstance(next_item_dataloader, int): - dataloader_idx = next_item_dataloader - else: - dataloader_iter = itertools.chain([next_item_dataloader], dataloader_iter) - mode = 'test' if self.trainer.testing else 'val' - # Initialize userbuffer communicators. - if self.initialize_ub: - self.initialize_ub_func() - - if isinstance(self.model, list): - for model_module in self.model: - model_module.eval() - - if self.cfg.get('fp8', False): - first_val_step = self.prev_step_training and not self.training - self.prev_step_training = self.training - else: - first_val_step = None - - loss, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, True, first_val_step) - - if isinstance(self.model, list): - for model_module in self.model: - model_module.train() - - if mode == 'val': - # MegatronGPTSFTModel class supports multiple dataloaders and uses validation_step of MegatronGPTModel. 
- # Supporting that case with below lines - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(loss) - else: - self.validation_step_outputs.append(loss) - else: - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(loss) - else: - self.test_step_outputs.append(loss) - - return loss, non_loss_tensors - - def constrastive_scores(self, pos_doc_hs, neg_doc_hs, query_hs, bs, temperature, use_all_possible_negatives=False): - all_doc_hs = torch.cat([pos_doc_hs, neg_doc_hs], dim=0) # (2bs) x hidden_size - cs = torch.mm(query_hs, all_doc_hs.transpose(0, 1)) # (bs) x (2bs) - pos_cs = cs[:, :bs].diag() - neg_cs = cs[:, bs:].diag() - if use_all_possible_negatives: - labels = torch.arange(bs, device=cs.device).long() - else: - labels = torch.zeros(bs, device=cs.device).long() - cs = torch.cat([pos_cs.unsqueeze(1), neg_cs.unsqueeze(1)], dim=1) - pos_cs = pos_cs.clone().detach().mean() - neg_cs = neg_cs.clone().detach().mean() - cs = cs.clamp(-1.0, 1.0) - cs = cs / temperature - return cs, pos_cs, neg_cs, labels - - def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): - hs = eos_tensors - hs = torch.nn.functional.normalize(hs, dim=1) - _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] - return { - "loss": _blank, - "query_hs": hs, - "pos_doc_hs": hs, - "pos_cs": _blank, - "neg_cs": _blank, - "diff_cs": _blank, - } - - def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): - idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) - eos_tensors = output_tensor[loss_mask, idx, :] - if self.global_inbatch_negatives and self.trainer.training: - eos_tensors = _gather_global_inbatch_representations(eos_tensors) - if not self.trainer.training: - return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) - bs = eos_tensors.shape[0] // 3 - query_hs = eos_tensors[::3, :] # every third tensor is a query (bs x hidden_size) - pos_doc_hs = eos_tensors[1::3, :] # every third tensor is a positive doc (bs x hidden_size) - neg_doc_hs = eos_tensors[2::3, :] # every third tensor is a negative doc (bs x hidden_size) - - query_hs = torch.nn.functional.normalize(query_hs, dim=1) - pos_doc_hs = torch.nn.functional.normalize(pos_doc_hs, dim=1) - neg_doc_hs = torch.nn.functional.normalize(neg_doc_hs, dim=1) - - cs, pos_cs, neg_cs, labels = self.constrastive_scores( - pos_doc_hs, neg_doc_hs, query_hs, bs, self.temperature, self.use_all_possible_negatives - ) - loss = torch.nn.functional.cross_entropy(cs, labels) - if self.mrl_dims: - for dim in self.mrl_dims: - cs_dim, _, _, _ = self.constrastive_scores( - pos_doc_hs[:, :dim], - neg_doc_hs[:, :dim], - query_hs[:, :dim], - bs, - self.temperature, - self.use_all_possible_negatives, - ) - loss += torch.nn.functional.cross_entropy(cs_dim, labels) - - cp_size = self.cfg.get('context_parallel_size', 1) - if cp_size > 1: - torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) - query_hs = query_hs.clone().detach() - pos_doc_hs = pos_doc_hs.clone().detach() - diff_cs = pos_cs - neg_cs - return { - "loss": loss, - "query_hs": query_hs, - "pos_doc_hs": pos_doc_hs, - "pos_cs": pos_cs, - "neg_cs": neg_cs, - "diff_cs": diff_cs, - } diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py 
b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py deleted file mode 100644 index 2f7722abc663..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import DictConfig, ListConfig - -from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTRerankerDataset -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import ( - MegatronGPTEmbeddingModel, - _gather_global_inbatch_representations, -) -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -def listify(tensor): - l_tensor = [] - for t in tensor: - for rid in range(t.shape[0]): - r = t[rid, :].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -class MegatronGPTRerankerModel(MegatronGPTEmbeddingModel): - def __init__(self, cfg: DictConfig, trainer: Trainer): - self.reward_model_loss = cfg.get("reward_model_loss", False) - super().__init__(cfg, trainer=trainer) - - def model_provider_func(self, pre_process, post_process): - # (@adithyare) We need post_process to be False to get hidden states in the loss_func - return super().model_provider_func(pre_process, post_process=False) - - def maybe_setup_test(self): - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - return - - def maybe_build_test(self): - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - logging.info('Building GPT Reranker test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. - self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - - def _build_dataset(self, data_cfg, is_train=True): - packed_sequence = data_cfg.get("packed_sequence", False) - - # Determine if we are using a single dataset or a list of datasets. - if is_train: - # Construct the data prefix list for `get_datasets_weights_and_num_samples()` - # that is of the format [weight1,file_name1,weight2,file_name2,...] - if data_cfg.concat_sampling_probabilities is None or not isinstance( - data_cfg.concat_sampling_probabilities, ListConfig - ): - raise ValueError( - ( - f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." 
- f"Found: {data_cfg.concat_sampling_probabilities}" - ) - ) - - if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): - raise ValueError( - ( - f"concat_sampling_probabilities must be of the same size as file_names.", - f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", - ) - ) - - data_prefix = [] - for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): - data_prefix.append(weight) - data_prefix.append(prefix) - - if self.trainer.max_steps is None or self.trainer.max_steps <= 0: - raise ValueError( - f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' - ) - num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] - _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) - num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) - else: - num_train_samples_per_dataset = [[None]] * len(data_cfg.file_names) - - # Check dataset max_seq_legnth and max_position_embeddings size - if ( - self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] - and data_cfg.max_seq_length > self.cfg.max_position_embeddings - ): - logging.warning( - f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" - ) - data_cfg.max_seq_length = self.cfg.max_position_embeddings - - # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 - # When using sequence parallel, sequence will further be split by TP size - pad_seq_length_to_mult = ( - 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 - ) - pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1) - - datasets = [] - for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - dataset = GPTRerankerDataset( - file_path=file_path, - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=num_samples[0], - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="train" if is_train else "validation", - ) - datasets.append(dataset) - if is_train: - if packed_sequence: - num_train_samples_after_blend = sum(len(dataset) for dataset in datasets) - dataset = BlendableDataset( - datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend - ) - return dataset - else: - return datasets - - def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): - loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) - logit_diff = non_loss_tensors['logit_diff'][0].item() - self.log("logit_diff", logit_diff, prog_bar=True, rank_zero_only=True, batch_size=1) - return loss_mean - - def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): - metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) - outputs = { - 'loss': loss, - 'metadata': metadata, # [dict] - 'query_pos_doc_logit': non_loss_tensors['query_pos_doc_logit'], # [batch_size, hidden_size] - } - return outputs - - def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): - query_pos_doc_hs = eos_tensors - _blank = torch.zeros(1, device=query_pos_doc_hs.device, dtype=query_pos_doc_hs.dtype)[0] - return { - "loss": _blank, - "query_pos_doc_logit": query_pos_doc_hs, - "query_neg_doc_logit": _blank, - "logit_diff": _blank, - } - - def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): - idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) - eos_tensors = output_tensor[loss_mask, idx, :] # (bs x 1) - if self.global_inbatch_negatives and self.trainer.training: - eos_tensors = _gather_global_inbatch_representations(eos_tensors) - if not self.trainer.training: - return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) - bs = eos_tensors.shape[0] // 2 - query_pos_doc_hs = eos_tensors[::2, :] # every second tensor from idx 0 is a query w pos_doc (bs x 1) - query_neg_doc_hs = eos_tensors[1::2, :] # every second tensor from idx 1 is a query w negative doc (bs x 1) - - if self.reward_model_loss: - loss = -torch.nn.functional.logsigmoid(query_pos_doc_hs - query_neg_doc_hs).mean() - else: - cs = torch.cat([query_pos_doc_hs, query_neg_doc_hs], dim=1) # (bs x 2) - cs = cs / self.temperature - labels = torch.zeros(bs, device=cs.device).long() - loss = torch.nn.functional.cross_entropy(cs, labels) - - cp_size = self.cfg.get('context_parallel_size', 1) - if cp_size > 1: - torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) - query_pos_doc_hs = query_pos_doc_hs.clone().detach() - query_neg_doc_hs = query_neg_doc_hs.clone().detach() - logit_diffs = torch.mean(query_pos_doc_hs - query_neg_doc_hs) - return { - "loss": loss, - "query_pos_doc_logit": query_pos_doc_hs, - "query_neg_doc_logit": query_neg_doc_hs, - "logit_diff": logit_diffs, - } - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, averaged_metric, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'query_pos_doc_logit': batch['query_pos_doc_logit'], - 'metadata': batch['metadata'], - } - for batch 
in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'query_pos_doc_logit': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['query_pos_doc_logit']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) - for q_hs, metadata in zip( - l_q_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['query_pos_doc_logit'].append(q_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - assert metric_name == "loss", "Only loss is supported for now." - # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() - # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() - # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() - # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - # (@adithyare) We are not using the log key to write the embeddings to file - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - hs = torch.cat(outputs['query_pos_doc_logit'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/logits.ids", "w") as f: - for m in outputs['metadata']: - f.write(f"{m['query_id'].strip()} {m['doc_id']}\n") - np.save(f"{emb_fldr}/logits.npy", hs_npy) - return True diff --git a/nemo/collections/nlp/models/intent_slot_classification/__init__.py b/nemo/collections/nlp/models/intent_slot_classification/__init__.py deleted file mode 100644 index 80f5f92bd80b..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.intent_slot_classification.intent_slot_classification_model import ( - IntentSlotClassificationModel, -) -from nemo.collections.nlp.models.intent_slot_classification.multi_label_intent_slot_classification_model import ( - MultiLabelIntentSlotClassificationModel, -) diff --git a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py deleted file mode 100644 index a49bc699ab24..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pathlib -from typing import Dict, List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss -from nemo.collections.nlp.data.intent_slot_classification import ( - IntentSlotClassificationDataset, - IntentSlotDataDesc, - IntentSlotInferenceDataset, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes import typecheck -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging - - -class IntentSlotClassificationModel(NLPModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes BERT Joint Intent and Slot model.""" - self.max_seq_length = cfg.language_model.max_seq_length - # init superclass - # Check the presence of data_dir. - if not cfg.data_dir or not os.path.exists(cfg.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.data_dir, cfg.train_ds, cfg.validation_ds) - super().__init__(cfg=cfg, trainer=trainer) - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_defaults_data_desc(self, cfg): - """ - Method makes sure that cfg.data_desc params are set. - If not, set's them to "dummy" defaults. - """ - if not hasattr(cfg, "data_desc"): - OmegaConf.set_struct(cfg, False) - cfg.data_desc = {} - # Intents. 
- cfg.data_desc.intent_labels = " " - cfg.data_desc.intent_label_ids = {" ": 0} - cfg.data_desc.intent_weights = [1] - # Slots. - cfg.data_desc.slot_labels = " " - cfg.data_desc.slot_label_ids = {" ": 0} - cfg.data_desc.slot_weights = [1] - - cfg.data_desc.pad_label = "O" - OmegaConf.set_struct(cfg, True) - - def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. - cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - {'intent_labels_file': 'intent_labels.csv', 'slot_labels_file': 'slot_labels.csv'} - ) - - slot_labels_file = os.path.join(data_dir, pathlib.Path(cfg.class_labels.slot_labels_file).name) - intent_labels_file = os.path.join(data_dir, pathlib.Path(cfg.class_labels.intent_labels_file).name) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact('class_labels.intent_labels_file', intent_labels_file) - self.register_artifact('class_labels.slot_labels_file', slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """Saves label ids map to a file""" - with open(filename, 'w') as out: - labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) - out.write('\n'.join(labels)) - logging.info(f'Labels: {label_ids}') - logging.info(f'Labels mapping saved to : {out.name}') - - def _reconfigure_classifier(self): - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.head.fc_dropout, - num_layers=self.cfg.head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == 'weighted_loss': - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = CrossEntropyLoss(logits_ndim=2, weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = CrossEntropyLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight] - ) - - # setup to track metrics - self.intent_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - 
label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - - def update_data_dir_for_training(self, data_dir: str, train_ds, validation_ds) -> None: - """ - Update data directory and get data stats with Data Descriptor. - Also, reconfigures the classifier - to cope with data with e.g. different number of slots. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - # Update configuration with new data. - self._set_data_desc_to_cfg(self.cfg, data_dir, train_ds, validation_ds) - # Reconfigure the classifier for different settings (number of intents, slots etc.). - self._reconfigure_classifier() - - def update_data_dir_for_testing(self, data_dir) -> None: - """ - Update data directory. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - intent_logits, slot_logits = self.classifier(hidden_states=hidden_states) - return intent_logits.float(), slot_logits.float() - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - train_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', train_loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': train_loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- """ - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - # calculate accuracy metrics for intents and slot reporting - # intents - preds = torch.argmax(intent_logits, axis=-1) - self.intent_classification_report.update(preds, intent_labels) - # slots - subtokens_mask = subtokens_mask > 0.5 - preds = torch.argmax(slot_logits, axis=-1)[subtokens_mask] - slot_labels = slot_labels[subtokens_mask] - self.slot_classification_report.update(preds, slot_labels) - - loss = { - 'val_loss': val_loss, - 'intent_tp': self.intent_classification_report.tp, - 'intent_fn': self.intent_classification_report.fn, - 'intent_fp': self.intent_classification_report.fp, - 'slot_tp': self.slot_classification_report.tp, - 'slot_fn': self.slot_classification_report.fn, - 'slot_fp': self.slot_classification_report.fp, - } - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. - """ - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - - # calculate metrics and log classification report (separately for intents and slots) - intent_precision, intent_recall, intent_f1, intent_report = self.intent_classification_report.compute() - logging.info(f'Intent report: {intent_report}') - - slot_precision, slot_recall, slot_f1, slot_report = self.slot_classification_report.compute() - logging.info(f'Slot report: {slot_report}') - - self.log(f'{prefix}_loss', avg_loss) - self.log('intent_precision', intent_precision) - self.log('intent_recall', intent_recall) - self.log('intent_f1', intent_f1) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - - self.intent_classification_report.reset() - self.slot_classification_report.reset() - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - - return { - f'{prefix}_loss': avg_loss, - 'intent_precision': intent_precision, - 'intent_recall': intent_recall, - 'intent_f1': intent_f1, - 'slot_precision': slot_precision, - 'slot_recall': slot_recall, - 'slot_f1': slot_f1, - } - - def test_step(self, batch, batch_idx): - """ - Lightning calls this inside the test loop with the data from the test dataloader - passed in as `batch`. - """ - loss = self.validation_step(batch, batch_idx) - self.test_step_outputs.append(loss) - return loss - - def on_test_epoch_end(self): - """ - Called at the end of test to aggregate outputs. - :param outputs: list of individual outputs of each test step. 
- """ - return self.on_validation_epoch_end() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - input_file = f'{self.data_dir}/{cfg.prefix}.tsv' - slot_file = f'{self.data_dir}/{cfg.prefix}_slots.tsv' - - if not (os.path.exists(input_file) and os.path.exists(slot_file)): - raise FileNotFoundError( - f'{input_file} or {slot_file} not found. Please refer to the documentation for the right format \ - of Intents and Slots files.' - ) - - dataset = IntentSlotClassificationDataset( - input_file=input_file, - slot_file=slot_file, - tokenizer=self.tokenizer, - max_seq_length=self.max_seq_length, - num_samples=cfg.num_samples, - pad_label=self.cfg.data_desc.pad_label, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def _setup_infer_dataloader(self, queries: List[str], test_ds) -> 'torch.utils.data.DataLoader': - """ - Setup function for a infer data loader. - Args: - queries: text - batch_size: batch size to use during inference - Returns: - A pytorch DataLoader. - """ - - dataset = IntentSlotInferenceDataset( - tokenizer=self.tokenizer, queries=queries, max_seq_length=-1, do_lower_case=False - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=test_ds.batch_size, - shuffle=test_ds.shuffle, - num_workers=test_ds.num_workers, - pin_memory=test_ds.pin_memory, - drop_last=test_ds.drop_last, - ) - - def predict_from_examples(self, queries: List[str], test_ds) -> List[List[str]]: - """ - Get prediction for the queries (intent and slots) - Args: - queries: text sequences - test_ds: Dataset configuration section. - Returns: - predicted_intents, predicted_slots: model intent and slot predictions - """ - predicted_intents = [] - predicted_slots = [] - mode = self.training - try: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - slot_labels = self.cfg.data_desc.slot_labels - - # Initialize tokenizer. - # if not hasattr(self, "tokenizer"): - # self._setup_tokenizer(self.cfg.tokenizer) - # Initialize modules. - # self._reconfigure_classifier() - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents and slots for these examples - # intents - intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1)) - - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intent_num in intent_preds: - if intent_num < len(intent_labels): - predicted_intents.append(intent_labels[int(intent_num)]) - else: - # should not happen - predicted_intents.append("Unknown Intent") - - # slots - slot_preds = torch.argmax(slot_logits, axis=-1) - - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - query_slots = '' - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + ' ' - else: - query_slots += 'Unknown_slot ' - predicted_slots.append(query_slots.strip()) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - model = PretrainedModelInfo( - pretrained_model_name="Joint_Intent_Slot_Assistant", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemonlpmodels/versions/1.0.0a5/files/Joint_Intent_Slot_Assistant.nemo", - description="This models is trained on this https://github.com/xliuhw/NLU-Evaluation-Data dataset which includes 64 various intents and 55 slots. Final Intent accuracy is about 87%, Slot accuracy is about 89%.", - ) - result.append(model) - return result diff --git a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py deleted file mode 100644 index 7a2bec1f2cc0..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
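For reference, the decoding loop in the deleted `predict_from_examples` above reduces to roughly the following sketch; the helper name and arguments are illustrative, and the out-of-range "Unknown Intent"/"Unknown_slot" fallbacks are omitted for brevity:

import torch

def decode_predictions(intent_logits, slot_logits, subtokens_mask, intent_labels, slot_labels):
    # One intent per query: argmax over the intent logits.
    intents = [intent_labels[int(i)] for i in torch.argmax(intent_logits, dim=-1)]
    slots = []
    for slot_row, mask_row in zip(torch.argmax(slot_logits, dim=-1), subtokens_mask):
        # Keep one slot label per word: only positions where the first-subtoken mask is set.
        slots.append(" ".join(slot_labels[int(s)] for s, m in zip(slot_row, mask_row) if m == 1))
    return intents, slots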
- -import os -from typing import List, Optional, Tuple - -import numpy as np -import numpy.typing as npt -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import f1_score, precision_score, recall_score -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, BCEWithLogitsLoss, CrossEntropyLoss -from nemo.collections.nlp.data.intent_slot_classification import ( - MultiLabelIntentSlotClassificationDataset, - MultiLabelIntentSlotDataDesc, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport, MultiLabelClassificationReport -from nemo.collections.nlp.models.intent_slot_classification import IntentSlotClassificationModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging - - -class MultiLabelIntentSlotClassificationModel(IntentSlotClassificationModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ - Initializes BERT Joint Intent and Slot model. - - Args: - cfg: configuration object - trainer: trainer for Pytorch Lightning - """ - self.max_seq_length = cfg.language_model.max_seq_length - - # Optimal Threshold - self.threshold = 0.5 - self.max_f1 = 0 - - # Check the presence of data_dir. - if not cfg.data_dir or not os.path.exists(cfg.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.data_dir, cfg.train_ds, cfg.validation_ds) - - # init superclass - super().__init__(cfg=cfg, trainer=trainer) - - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_data_desc_to_cfg( - self, cfg: DictConfig, data_dir: str, train_ds: DictConfig, validation_ds: DictConfig - ) -> None: - """ - Creates MultiLabelIntentSlotDataDesc and copies generated values to Configuration object's data descriptor. - - Args: - cfg: configuration object - data_dir: data directory - train_ds: training dataset file name - validation_ds: validation dataset file name - - Returns: - None - """ - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = MultiLabelIntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. 
- cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - { - "intent_labels_file": "intent_labels.csv", - "slot_labels_file": "slot_labels.csv", - } - ) - - slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file) - intent_labels_file = os.path.join(data_dir, cfg.class_labels.intent_labels_file) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact("class_labels.intent_labels_file", intent_labels_file) - self.register_artifact("class_labels.slot_labels_file", slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _reconfigure_classifier(self) -> None: - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.bert_model.config.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.head.fc_dropout, - num_layers=self.cfg.head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == "weighted_loss": - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = BCEWithLogitsLoss(logits_ndim=2, pos_weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = BCEWithLogitsLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, - weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight], - ) - - # setup to track metrics - self.intent_classification_report = MultiLabelClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode="micro", - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode="micro", - ) - - def validation_step(self, batch, batch_idx) -> None: - """ - Validation Loop. Pytorch Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- - Args: - batch: batches of data from DataLoader - batch_idx: batch idx from DataLoader - - Returns: - None - """ - ( - input_ids, - input_type_ids, - input_mask, - loss_mask, - subtokens_mask, - intent_labels, - slot_labels, - ) = batch - intent_logits, slot_logits = self( - input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask, - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - intent_probabilities = torch.round(torch.sigmoid(intent_logits)) - - self.intent_classification_report.update(intent_probabilities, intent_labels) - # slots - subtokens_mask = subtokens_mask > 0.5 - preds = torch.argmax(slot_logits, axis=-1)[subtokens_mask] - slot_labels = slot_labels[subtokens_mask] - self.slot_classification_report.update(preds, slot_labels) - - loss = { - "val_loss": val_loss, - "intent_tp": self.intent_classification_report.tp, - "intent_fn": self.intent_classification_report.fn, - "intent_fp": self.intent_classification_report.fp, - "slot_tp": self.slot_classification_report.tp, - "slot_fn": self.slot_classification_report.fn, - "slot_fp": self.slot_classification_report.fp, - } - self.validation_step_outputs.append(loss) - return loss - - def _setup_dataloader_from_config(self, cfg: DictConfig) -> DataLoader: - """ - Creates the DataLoader from the configuration object - - Args: - cfg: configuration object - - Returns: - DataLoader for model's data - """ - - input_file = f"{self.data_dir}/{cfg.prefix}.tsv" - slot_file = f"{self.data_dir}/{cfg.prefix}_slots.tsv" - intent_dict_file = self.data_dir + "/dict.intents.csv" - - lines = open(intent_dict_file, "r").readlines() - lines = [line.strip() for line in lines if line.strip()] - num_intents = len(lines) - - if not (os.path.exists(input_file) and os.path.exists(slot_file)): - raise FileNotFoundError( - f"{input_file} or {slot_file} not found. Please refer to the documentation for the right format \ - of Intents and Slots files." - ) - - dataset = MultiLabelIntentSlotClassificationDataset( - input_file=input_file, - slot_file=slot_file, - num_intents=num_intents, - tokenizer=self.tokenizer, - max_seq_length=self.max_seq_length, - num_samples=cfg.num_samples, - pad_label=self.cfg.data_desc.pad_label, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def prediction_probabilities(self, queries: List[str], test_ds: DictConfig) -> npt.NDArray: - """ - Get prediction probabilities for the queries (intent and slots) - - Args: - queries: text sequences - test_ds: Dataset configuration section. - - Returns: - numpy array of intent probabilities - """ - - probabilities = [] - - mode = self.training - try: - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
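# Editor's note (hedged sketch): a small stand-alone illustration of the post-processing
# in validation_step above -- intents are multi-label, so the sigmoid outputs are rounded
# to a multi-hot vector, while slots are single-label per token (argmax) and only the
# positions flagged by subtokens_mask are scored. The random tensors are placeholders
# for real batches; shapes are illustrative.
import torch

intent_logits = torch.randn(2, 6)              # (batch, num_intents)
slot_logits = torch.randn(2, 12, 10)           # (batch, seq_len, num_slots)
subtokens_mask = torch.randint(0, 2, (2, 12))  # 1 marks the first sub-token of each word

intent_preds = torch.round(torch.sigmoid(intent_logits))     # multi-hot predictions
keep = subtokens_mask > 0.5
slot_preds = torch.argmax(slot_logits, dim=-1)[keep]          # flattened kept positions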
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents for these examples - probabilities.append(torch.sigmoid(intent_logits).detach().cpu().numpy()) - - probabilities = np.concatenate(probabilities) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return probabilities - - def optimize_threshold(self, test_ds: DictConfig, file_name: str) -> None: - """ - Set the optimal threshold of the model from performance on validation set. This threshold is used to round the - logits to 0 or 1. - - Args: - test_ds: location of test dataset - file_name: name of input file to retrieve validation set - - Returns: - None - """ - - input_file = f"{self.data_dir}/{file_name}.tsv" - - with open(input_file, "r") as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - dataset = list(input_lines) - - metrics_labels, sentences = [], [] - - for input_line in dataset: - sentence = input_line.strip().split("\t")[0] - sentences.append(sentence) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(len(self.cfg.data_desc.intent_labels))] - metrics_labels.append(parts) - - # Retrieve class probabilities for each sentence - intent_probabilities = self.prediction_probabilities(sentences, test_ds) - - metrics_dict = {} - # Find optimal logits rounding threshold for intents - for i in np.arange(0.5, 0.96, 0.01): - predictions = (intent_probabilities >= i).tolist() - precision = precision_score(metrics_labels, predictions, average='micro') - recall = recall_score(metrics_labels, predictions, average='micro') - f1 = f1_score(metrics_labels, predictions, average='micro') - metrics_dict[i] = [precision, recall, f1] - - max_precision = max(metrics_dict, key=lambda x: metrics_dict[x][0]) - max_recall = max(metrics_dict, key=lambda x: metrics_dict[x][1]) - max_f1_score = max(metrics_dict, key=lambda x: metrics_dict[x][2]) - - logging.info( - f'Best Threshold for F1-Score: {max_f1_score}, [Precision, Recall, F1-Score]: {metrics_dict[max_f1_score]}' - ) - logging.info( - f'Best Threshold for Precision: {max_precision}, [Precision, Recall, F1-Score]: {metrics_dict[max_precision]}' - ) - logging.info( - f'Best Threshold for Recall: {max_recall}, [Precision, Recall, F1-Score]: {metrics_dict[max_recall]}' - ) - - if metrics_dict[max_f1_score][2] > self.max_f1: - self.max_f1 = metrics_dict[max_f1_score][2] - - logging.info(f'Setting Threshold to: {max_f1_score}') - - self.threshold = max_f1_score - - def predict_from_examples( - self, queries: List[str], test_ds: DictConfig, threshold: float = None - ) -> Tuple[List[List[Tuple[str, float]]], List[str], List[List[int]]]: - """ - Get prediction for the queries (intent and slots) - - - Args: - queries: text sequences - test_ds: Dataset configuration section. 
- threshold: Threshold for rounding prediction logits - - Returns: - predicted_intents: model intent predictions with their probabilities - Example: [[('flight', 0.84)], [('airfare', 0.54), - ('flight', 0.73), ('meal', 0.24)]] - predicted_slots: model slot predictions - Example: ['O B-depart_date.month_name B-depart_date.day_number', - 'O O B-flight_stop O O O'] - - predicted_vector: model intent predictions for each individual query. Binary values within each list - indicate whether a class is prediced for the given query (1 for True, 0 for False) - Example: [[1,0,0,0,0,0], [0,0,1,0,0,0]] - """ - predicted_intents = [] - - if threshold is None: - threshold = self.threshold - logging.info(f'Using threshold = {threshold}') - - predicted_slots = [] - predicted_vector = [] - - mode = self.training - try: - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - slot_labels = self.cfg.data_desc.slot_labels - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. - infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents and slots for these examples - # intents - intent_preds = tensor2list(torch.sigmoid(intent_logits)) - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intents in intent_preds: - intent_lst = [] - temp_list = [] - for intent_num, probability in enumerate(intents): - if probability >= threshold: - intent_lst.append((intent_labels[int(intent_num)], round(probability, 2))) - temp_list.append(1) - else: - temp_list.append(0) - - predicted_vector.append(temp_list) - predicted_intents.append(intent_lst) - - # slots - slot_preds = torch.argmax(slot_logits, axis=-1) - temp_slots_preds = [] - - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - temp_slots = "" - query_slots = "" - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + " " - temp_slots += f"{slot} " - else: - query_slots += "Unknown_slot " - temp_slots += "0 " - predicted_slots.append(query_slots.strip()) - temp_slots_preds.append(temp_slots) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots, predicted_vector - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - To be added - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 6cc317d1efea..e6eaee440acb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
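# Editor's note (hedged sketch): a condensed illustration of the threshold search done by
# optimize_threshold in the deleted multi-label model above -- sweep candidate thresholds,
# binarize the sigmoid probabilities, and keep the threshold with the best micro-averaged
# F1. The probabilities/labels below are random placeholders; in the model they come from
# prediction_probabilities() and the validation TSV file.
import numpy as np
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)
probs = rng.random((100, 6))                      # (num_queries, num_intents)
labels = rng.integers(0, 2, size=(100, 6))        # multi-hot ground truth

best_threshold, best_f1 = 0.5, -1.0
for t in np.arange(0.5, 0.96, 0.01):
    preds = (probs >= t).astype(int)
    f1 = f1_score(labels, preds, average="micro")
    if f1 > best_f1:
        best_threshold, best_f1 = float(t), f1

print(f"best threshold {best_threshold:.2f}, micro-F1 {best_f1:.3f}")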
+# pylint: skip-file +# flake8: noqa + import itertools import re from collections import OrderedDict @@ -24,7 +27,15 @@ from torch import Tensor from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer -from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores + +try: + from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores +except ModuleNotFoundError: + from abc import ABC + + AccuracyScore = ABC + BLEUScore = ABC + ROUGEScores = ABC from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common import ( PromptEncoder, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py index d3829c3e8de1..e459f69c1fcd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py @@ -11,13 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# pylint: skip-file +# flake8: noqa + from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import ( - TextToTextGLUEDataset, - TextToTextXNLIDataset, -) +try: + from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import ( + TextToTextGLUEDataset, + TextToTextXNLIDataset, + ) +except ModuleNotFoundError: + from abc import ABC + + TextToTextGLUEDataset = ABC + TextToTextXNLIDataset = ABC from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel from nemo.utils import logging diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 3b5c9f1161bb..1ea2481fae22 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -27,7 +27,15 @@ from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores + +try: + from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores +except ModuleNotFoundError: + from abc import ABC + + AccuracyScore = ABC + BLEUScore = ABC + ROUGEScores = ABC from nemo.collections.nlp.models.language_modeling.megatron_base_prompt_learning_model import ( MegatronBasePromptLearningModel, ) diff --git a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py index 3b8e1f819ea1..769ea3a0ddd7 100644 --- a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
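# Editor's note (hedged sketch): the hunks above repeat one guard pattern -- optional NLP
# metric/dataset imports are wrapped in try/except ModuleNotFoundError and replaced with
# abc.ABC placeholders, so the surrounding module can still be imported when the optional
# dependency is absent. A generic illustration of the same pattern; the package name
# some_optional_package is invented.
from abc import ABC

try:
    from some_optional_package.metrics import AccuracyScore, BLEUScore, ROUGEScores
except ModuleNotFoundError:
    # Placeholders keep annotations and module import working; code paths that actually
    # need the real metrics are expected to fail elsewhere if the dependency is missing.
    AccuracyScore = ABC
    BLEUScore = ABC
    ROUGEScores = ABC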
+# pylint: skip-file + import json import math from typing import Dict, Optional @@ -26,7 +28,13 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.nlp.data import SentenceDataset, TarredSentenceDataset -from nemo.collections.nlp.metrics import SequencePerplexity + +try: + from nemo.collections.nlp.metrics import SequencePerplexity +except ModuleNotFoundError: + from abc import ABC + + SequencePerplexity = ABC from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer diff --git a/nemo/collections/nlp/models/token_classification/__init__.py b/nemo/collections/nlp/models/token_classification/__init__.py deleted file mode 100644 index c903cc8812cb..000000000000 --- a/nemo/collections/nlp/models/token_classification/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationModelConfig, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_lexical_audio_model import ( - PunctuationCapitalizationLexicalAudioModel, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_model import ( - PunctuationCapitalizationModel, -) -from nemo.collections.nlp.models.token_classification.token_classification_model import TokenClassificationModel diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py deleted file mode 100644 index 86bf12b92315..000000000000 --- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
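# Editor's note (hedged sketch): the deleted config module that follows is built from
# nested dataclasses that OmegaConf can turn into typed ("structured") configs, with
# MISSING marking mandatory fields. A minimal illustration of that pattern using
# simplified stand-ins (the *Sketch classes and the placeholder model string are not the
# real NeMo definitions):
from dataclasses import dataclass
from typing import Optional

from omegaconf import MISSING, OmegaConf


@dataclass
class FreezeConfigSketch:
    is_enabled: bool = False
    d_model: Optional[int] = 256


@dataclass
class AudioEncoderConfigSketch:
    pretrained_model: str = MISSING          # mandatory: must come from YAML/CLI overrides
    freeze: Optional[FreezeConfigSketch] = None


cfg = OmegaConf.structured(AudioEncoderConfigSketch)
cfg = OmegaConf.merge(cfg, OmegaConf.create({"pretrained_model": "some_pretrained_audio_encoder"}))
print(OmegaConf.to_yaml(cfg))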
- -from dataclasses import dataclass, field -from typing import Any, Dict, Optional - -from omegaconf.omegaconf import MISSING, DictConfig, OmegaConf, open_dict - -from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset import ( - PunctuationCapitalizationEvalDataConfig, - PunctuationCapitalizationTrainDataConfig, - legacy_data_config_to_new_data_config, -) -from nemo.core.config import TrainerConfig -from nemo.core.config.modelPT import NemoConfig -from nemo.utils.exp_manager import ExpManagerConfig - - -@dataclass -class FreezeConfig: - is_enabled: bool = False - """Freeze audio encoder weight and add Conformer Layers on top of it""" - d_model: Optional[int] = 256 - """`d_model` parameter of ``ConformerLayer``""" - d_ff: Optional[int] = 1024 - """``d_ff`` parameter of ``ConformerLayer``""" - num_layers: Optional[int] = 8 - """``num_layers`` number of ``ConformerLayer`` modules to add on top of audio encoder""" - - -@dataclass -class AdapterConfig: - config: Optional[LinearAdapterConfig] = None - """Linear adapter config see ``collections.common.parts.LinearAdapterConfig``""" - enable: bool = False - """Use adapters for audio encoder""" - - -@dataclass -class FusionConfig: - num_layers: Optional[int] = 4 - """"Number of layers to use in fusion""" - num_attention_heads: Optional[int] = 4 - """Number of attention heads to use in fusion""" - inner_size: Optional[int] = 2048 - """Fusion inner size""" - - -@dataclass -class AudioEncoderConfig: - pretrained_model: str = MISSING - """A configuration for restoring pretrained audio encoder""" - freeze: Optional[FreezeConfig] = None - adapter: Optional[AdapterConfig] = None - fusion: Optional[FusionConfig] = None - - -@dataclass -class TokenizerConfig: - """A structure and default values of source text tokenizer.""" - - vocab_file: Optional[str] = None - """A path to vocabulary file which is used in ``'word'``, ``'char'``, and HuggingFace tokenizers""" - - tokenizer_name: str = MISSING - """A name of the tokenizer used for tokenization of source sequences. Possible options are ``'sentencepiece'``, - ``'word'``, ``'char'``, HuggingFace tokenizers (e.g. ``'bert-base-uncased'``). For more options see function - ``nemo.collections.nlp.modules.common.get_tokenizer``. The tokenizer must have properties ``cls_id``, ``pad_id``, - ``sep_id``, ``unk_id``.""" - - special_tokens: Optional[Dict[str, str]] = None - """A dictionary with special tokens passed to constructors of ``'char'``, ``'word'``, ``'sentencepiece'``, and - various HuggingFace tokenizers.""" - - tokenizer_model: Optional[str] = None - """A path to a tokenizer model required for ``'sentencepiece'`` tokenizer.""" - - -@dataclass -class LanguageModelConfig: - """ - A structure and default values of language model configuration of punctuation and capitalization model. BERT like - HuggingFace models are supported. Provide a valid ``pretrained_model_name`` and, optionally, you may - reinitialize model via ``config_file`` or ``config``. - - Alternatively you can initialize the language model using ``lm_checkpoint``. - - This config is a part of :class:`PunctuationCapitalizationModelConfig` config. - """ - - pretrained_model_name: str = MISSING - """A mandatory parameter containing name of HuggingFace pretrained model. 
For example, ``'bert-base-uncased'``.""" - - config_file: Optional[str] = None - """A path to a file with HuggingFace model config which is used to reinitialize language model.""" - - config: Optional[Dict] = None - """A HuggingFace config which is used to reinitialize language model.""" - - lm_checkpoint: Optional[str] = None - """A path to a ``torch`` checkpoint of a language model.""" - - -@dataclass -class HeadConfig: - """ - A structure and default values of configuration of capitalization or punctuation model head. This config defines a - multilayer perceptron which is applied to output of a language model. Number of units in the hidden layer is equal - to the dimension of the language model. - - This config is a part of :class:`PunctuationCapitalizationModelConfig` config. - """ - - num_fc_layers: int = 1 - """A number of hidden layers in a multilayer perceptron.""" - - fc_dropout: float = 0.1 - """A dropout used in an MLP.""" - - activation: str = 'relu' - """An activation used in hidden layers.""" - - use_transformer_init: bool = True - """Whether to initialize the weights of the classifier head with the approach that was used for language model - initialization.""" - - -@dataclass -class ClassLabelsConfig: - """ - A structure and default values of a mandatory part of config which contains names of files which are saved in .nemo - checkpoint. These files can also be used for passing label vocabulary to the model. For using them as label - vocabularies you will need to provide path these files in parameter - ``model.common_dataset_parameters.label_vocab_dir``. Each line in labels files - contains 1 label. The values are sorted, ``==