diff --git a/nemo/collections/nlp/data/dialogue/__init__.py b/nemo/collections/nlp/data/dialogue/__init__.py deleted file mode 100644 index a3992ef59971..000000000000 --- a/nemo/collections/nlp/data/dialogue/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset import ( - DialogueBERTDataset, - DialogueGPTClassificationDataset, - DialogueSGDBERTDataset, - DialogueZeroShotIntentDataset, -) -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/nemo/collections/nlp/data/dialogue/data_processor/__init__.py b/nemo/collections/nlp/data/dialogue/data_processor/__init__.py deleted file mode 100644 index 2db92b257416..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py deleted file mode 100644 index 92c56a4c20df..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueAssistantDataProcessor'] - - -class DialogueAssistantDataProcessor(DialogueDataProcessor): - """Data Processor for Assistant dialogues.""" - - def __init__(self, data_dir: str, tokenizer: object, cfg): - """ - Constructs DialogueAssistantDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - """ - # deprecation warning - deprecated_warning("DialogueAssistantDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - self.intents = self.open_file("dict.intents.csv") - if self.cfg.preprocess_intent_function == 'remove_domain': - self.intents = [ - DialogueAssistantDataProcessor.normalize_zero_shot_intent(intent) for intent in self.intents - ] - self.slots = self.open_file("dict.slots.csv") - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(self.slots) - self.slots = unified_slots - - self.bio_slot_ids_to_unified_slot_ids = bio_slot_ids_to_unified_slot_ids - self.services = sorted(list(set([intent.split('_')[0] for intent in self.intents]))) - self.empty_slot_id = [str(idx) for idx, slot_name in enumerate(self.slots) if slot_name == "O"][0] - - @staticmethod - def normalize_zero_shot_intent(label): - label = label.split('.')[1] - if label == 'nomatch': - return 'no match' - else: - return label.replace('_', ' ') - - def open_file(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - lines = [i.strip() for i in f.readlines()] - return lines - - @staticmethod - def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ - Extract continuous spans of slot_ids - - To accomodate slots with distinct labels for B-label1 and I-label1, - slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - - Args: - Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 - Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" - Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, - each containing a start position and an exclusive end position - E.g {18: [9, 10], 44: [11, 13], 46: [14, 16], 12: [17, 18]} - """ - slot_id_stack = [] - position_stack = [] - for i in range(len(slot_ids)): - slot_id = slot_ids[i] - - slot_id = bio_slot_ids_to_unified_slot_ids[slot_id] - - if not slot_id_stack or slot_id != slot_id_stack[-1]: - slot_id_stack.append(slot_id) - position_stack.append([]) - position_stack[-1].append(i) - - slot_id_to_start_and_exclusive_end = { - slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1] - for i in range(len(position_stack)) - if slot_id_stack[i] != empty_slot_id - } - - return slot_id_to_start_and_exclusive_end - - @staticmethod - def map_bio_format_slots_to_unified_slots(slots): - """ - maps BIO format slots to unified slots (meaning that B-alarm_time and I-alarm_time both map to alarm_time) - called even slots does not contain BIO, for unified interface - in that case slots == unified_slots and 
bio_slot_ids_to_unified_slot_ids is an identity mapping i.e. {"0": "0", "1": "1"} - """ - bio_slot_ids_to_unified_slot_ids = {} - unified_slots = [] - unified_idx = -1 - for idx, slot in enumerate(slots): - if slot.replace('I-', '').replace('B-', '') not in unified_slots: - unified_idx += 1 - unified_slots.append(slot.replace('I-', '').replace('B-', '')) - bio_slot_ids_to_unified_slot_ids[str(idx)] = str(unified_idx) - return bio_slot_ids_to_unified_slot_ids, unified_slots - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. - This is done by taking every 10th example and putting it into the dev set, - with all other examples going into the new train set. - """ - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "test"} - - raw_examples_intent = self.open_file("{}.tsv".format(dataset_split_print[dataset_split])) - # removes header of tsv file - raw_examples_intent = raw_examples_intent[1:] - raw_examples_slots = self.open_file("{}_slots.tsv".format(dataset_split_print[dataset_split])) - - if dataset_split in ["train", "dev"]: - train_idx = [] - dev_idx = [] - for idx in range(len(raw_examples_intent)): - if idx % 10 == 0: - dev_idx.append(idx) - else: - train_idx.append(idx) - - if dataset_split == "train": - raw_examples_intent = [raw_examples_intent[idx] for idx in train_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in train_idx] - elif dataset_split == "dev": - raw_examples_intent = [raw_examples_intent[idx] for idx in dev_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in dev_idx] - - for i in range(len(raw_examples_intent)): - utterance, intent_id = raw_examples_intent[i].split('\t') - slot_ids = raw_examples_slots[i].split() - utterance_tokens = utterance.split() - intent = self.intents[int(intent_id)] - slot_id_to_start_and_exclusive_end = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, self.empty_slot_id, self.bio_slot_ids_to_unified_slot_ids - ) - - slot_to_start_and_exclusive_end = { - self.slots[int(slot_id)]: position for slot_id, position in slot_id_to_start_and_exclusive_end.items() - } - slot_to_words = { - slot: ' '.join(utterance_tokens[position[0] : position[1]]) - for slot, position in slot_to_start_and_exclusive_end.items() - } - input_example = { - "utterance": utterance, - "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, - "label_positions": { - "slots": { - slot: { - "start": position[0], - "exclusive_end": position[1], - "slot": slot, - } - for slot, position in slot_to_start_and_exclusive_end.items() - } - }, - "possible_labels": { - "service": self.services, - "intent": self.intents, - "slots": { - # this dataset does not support categorical slots (i.e. 
only extractive slots) - # therefore use empty list for all values - slot: [] - for slot in self.slots - }, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py deleted file mode 100644 index c41c1f5e04ca..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import random - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueDataProcessor'] - - -class DialogueDataProcessor(DataProcessor): - """ - Base class for Data Processing for all data sources - - Data Processor is designed to be Model-independent (but Data-dependent) so that - - Encourages experimentation with a variety of models \ - (BERT-style; GPT-style; T5-style), \ - which have different tokenization/preprocessing requirements - - Facilitates experiments with a variety of data sources, - as data is processed into a common format - - Roles - 1. Processes raw files into Dialogue Input Examples. - 2. 
Keeps all possibly relevant information from the raw files, which - the Dataset class can then determine which labels to use - - """ - - def __init__(self): - # deprecation warning - deprecated_warning("DialogueDataProcessor") - - raise NotImplementedError() - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - raise NotImplementedError() - - @staticmethod - def get_relevant_idxs(dataset_split, n_samples, dev_proportion): - """ - Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: - dataset_split: train, dev or test - n_samples: total number of samples - dev_proportion: value from 1 to 99 that represent proportion of data in dev set - Returns: - idxs: indices for relevant samples - """ - - if dataset_split in ["train", "dev"]: - n_dev = int(n_samples * (dev_proportion / 100)) - dev_idxs = random.sample(list(range(n_samples)), n_dev) - if dataset_split == "dev": - idxs = dev_idxs - else: - dev_idxs_set = set(dev_idxs) - train_idxs = [idx for idx in list(range(n_samples)) if idx not in dev_idxs_set] - idxs = train_idxs - - elif dataset_split == "test": - idxs = list(range(n_samples)) - - else: - raise ValueError("please select dataset split from train, dev and test") - - return idxs diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py deleted file mode 100644 index 56e99c4bcfe9..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
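# Illustration only: a minimal, self-contained sketch of the span-extraction behaviour that
# DialogueAssistantDataProcessor.get_continuous_slots documents above. It is not part of the
# NeMo code in this diff; the slot ids, the empty-slot id and the BIO-to-unified mapping below
# are made-up example values.
def extract_continuous_slot_spans(slot_ids, empty_slot_id, bio_to_unified):
    """Collapse a per-token slot-id sequence into {slot_id: [start, exclusive_end]} spans."""
    slot_id_stack, position_stack = [], []
    for i, slot_id in enumerate(slot_ids):
        slot_id = bio_to_unified[slot_id]  # B-label and I-label ids collapse to one unified id
        if not slot_id_stack or slot_id != slot_id_stack[-1]:
            slot_id_stack.append(slot_id)
            position_stack.append([])
        position_stack[-1].append(i)
    return {
        slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1]
        for i in range(len(position_stack))
        if slot_id_stack[i] != empty_slot_id
    }

# Tokens 2-3 carry slot "1" (tagged with the B-/I- ids "1" and "2"); everything else is "O" (id "0").
bio_to_unified = {"0": "0", "1": "1", "2": "1"}
print(extract_continuous_slot_spans(["0", "0", "1", "2", "0", "0"], "0", bio_to_unified))
# -> {'1': [2, 4]}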
- -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueDesignDataProcessor'] - - -class DialogueDesignDataProcessor(DialogueDataProcessor): - """Data Processor for Design Dataset""" - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueDesignDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueDesignDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_design_OV.csv') - # remove disabled examples - raw_examples = [raw_examples[i] for i in range(len(raw_examples)) if raw_examples[i]['disabled'] != 'yes'] - - n_samples = len(raw_examples) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - all_intents = sorted(list(set(raw_examples[i]['intent labels'] for i in range(len(raw_examples))))) - all_services = sorted(list(set(raw_examples[i]['domain'] for i in range(len(raw_examples))))) - for i in idxs: - raw_example = raw_examples[i] - utterances = [raw_example['example_{}'.format(i)] for i in range(1, 4)] - service = raw_example['domain'] - intent = raw_example['intent'] - intent_description = raw_example['intent labels'] - system_utterance = raw_example['response'] - - slot_names = [raw_example['slot{}'.format(i)] for i in range(1, 3)] - # these are possible slot values not ground truth slot values - slot_values = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - slot_questions = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - - for j in range(1, 3): - value = raw_example['slot{}'.format(j)] - if isinstance(value, str): - system_utterance = system_utterance.replace('slot{}'.format(j), value) - - valid_slots_ids = [i for i, slot in enumerate(slot_names) if isinstance(slot, str)] - slot_names = [slot_names[i] for i in valid_slots_ids] - slot_values = [slot_values[i] if isinstance(slot_values[i], str) else '' for i in valid_slots_ids] - slot_questions = [slot_questions[i] if isinstance(slot_questions[i], str) else '' for i in valid_slots_ids] - - for utterance in utterances: - if not isinstance(utterance, str): - continue - input_example = { - "utterance": utterance, - "system_utterance": system_utterance, - "labels": { - "service": service, - "intent": intent_description, - "slots": { - slot: '' for slot in slot_names - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': all_intents, - "service": all_services, - "slots": {slot: slot_values[i] for i, slot 
in enumerate(slot_names)}, - }, - "description": { - "service": service, - "intent": intent_description, - "slots": {slot: slot_questions[i] for i, slot in enumerate(slot_names)}, - }, - } - - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py deleted file mode 100644 index 67d58ff5d21e..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueMellonQADataProcessor'] - - -class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues.""" - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueMellonQADataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. 
- Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_qa_data.csv') - raw_examples = list(raw_examples.values()) - # filter out answers with no answer - raw_examples = [ - example - for example in raw_examples - if isinstance(example['Non Generative Question Answering '], str) - and isinstance(example['Generative Question Answering '], str) - ] - - n_samples = len(raw_examples) - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - for i in idxs: - utterance = str(raw_examples[i]['Question']) - answer = str(raw_examples[i]['Non Generative Question Answering ']) - well_formed_answer = str(raw_examples[i]['Generative Question Answering ']) - passage = raw_examples[i]['Passage'] - input_example = { - "utterance": utterance, - "example_id": i, - "labels": { - "response": answer, - "fluent_response": well_formed_answer, - "passage": passage, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py deleted file mode 100644 index d09960a35d69..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from ast import literal_eval - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueMSMarcoDataProcessor'] - - -class DialogueMSMarcoDataProcessor(DialogueDataProcessor): - """Data Processor for MS Marco dialogues. 
(https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz - """ - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - debug_mode: reduce number of samples to load in order to increase speed of processing - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueMSMarcoDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_json(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - data = json.load(f) - return data - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - """ - - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "dev"} - - raw_examples = self.open_json("{}_v2.1.json".format(dataset_split_print[dataset_split])) - - n_samples = len(raw_examples['answers']) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - if self.cfg.debug_mode: - idxs = idxs[:100] - - for i in idxs: - utterance = raw_examples['query'][str(i)] - # answer need not be extracted from passage - # taking the first answer as the ground truth correct answer as only <1% has multiple answers - answer = raw_examples['answers'][str(i)] - answer = answer[0] if isinstance(answer, list) else answer - - well_formed_answer = raw_examples['wellFormedAnswers'][str(i)] - well_formed_answer = ( - well_formed_answer if isinstance(well_formed_answer, list) else literal_eval(well_formed_answer) - ) - well_formed_answer = well_formed_answer[0] if well_formed_answer else None - query_type = raw_examples['query_type'][str(i)] - candidate_passages = raw_examples['passages'][str(i)] - passage = [ - candidate_passage["passage_text"] - for candidate_passage in candidate_passages - if int(candidate_passage["is_selected"]) - ] - passage = passage[0] if passage else None - - possible_passages = [candidate_passage["passage_text"] for candidate_passage in candidate_passages] - - input_example = { - "utterance": utterance, - "example_id": i, - "labels": { - "service": query_type, - "response": answer, - "fluent_response": well_formed_answer, - "passage": passage, - }, - "possible_labels": { - "service": "LOCATION,NUMERIC,PERSON,DESCRIPTION,ENTITY".split(','), - "passage": possible_passages, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return 
self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py deleted file mode 100644 index 1d37c26f1c45..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ /dev/null @@ -1,578 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" -import collections -import json -import os -import pickle -import re -from typing import List - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = ['DialogueSGDDataProcessor'] - -FILE_RANGES = { - "sgd_single_domain": {"train": range(1, 44), "dev": range(1, 8), "test": range(1, 12)}, - "sgd_multi_domain": {"train": range(44, 128), "dev": range(8, 21), "test": range(12, 35)}, - "sgd_all": {"train": range(1, 128), "dev": range(1, 21), "test": range(1, 35)}, - "sgd_all_single": {"train": range(1, 128), "dev": range(1, 8), "test": range(1, 12)}, - "multiwoz": {"train": range(1, 18), "dev": range(1, 3), "test": range(1, 3)}, - "debug_sample": {"train": range(1, 2), "dev": range(1, 2), "test": range(1, 2)}, -} - - -class DialogueSGDDataProcessor(DialogueDataProcessor): - """Data Processor for SGD dialogues. - - More information at https://arxiv.org/abs/1909.05855 - - ***Downloading the dataset*** - # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git - - ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. - - In the following we will show an example for a service entry in the schema file. - * service_name - * description - * slots - * name - * description - * is_categorical - * possible values - * intents - * name - * description - * required_slots (not used) - * is_transactional (not used) - * optional_slots (not used) - * result_slots (not used) - - - In the following we will show an example for a dialogue. 
- * dialogue_id - * services - * turns - * frames - * actions - * act - * slot - * values - * service - * slots - * exclusive_end - * slot - * start - * state - * active_intent - * requeste_slots - * slot_values - * speaker - [USER, SYSTEM] - * utterance - - """ - - def __init__( - self, - data_dir: str, - dialogues_example_dir: str, - tokenizer: object, - cfg=None, - ): - """ - Constructs DialogueSGDDataProcessor - Args: - data_dir: path to data directory - dialogues_example_dir: path to store processed dialogue examples - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueSGDDataProcessor") - - self.data_dir = data_dir - self.cfg = cfg - - self._task_name = self.cfg.task_name # e.g. "sgd_single_domain" - self._subsample = self.cfg.subsample - - all_schema_json_paths = [] - for dataset_split in ['train', 'test', 'dev']: - all_schema_json_paths.append(os.path.join(self.cfg.data_dir, dataset_split, "schema.json")) - self.schemas = Schema(all_schema_json_paths) - - self.schema_config = { - "MAX_NUM_CAT_SLOT": self.cfg.max_num_cat_slot, - "MAX_NUM_NONCAT_SLOT": self.cfg.max_num_noncat_slot, - "MAX_NUM_VALUE_PER_CAT_SLOT": self.cfg.max_value_per_cat_slot, - "MAX_NUM_INTENT": self.cfg.max_num_intent, - "NUM_TASKS": self.cfg.num_tasks, - "MAX_SEQ_LENGTH": self.cfg.max_seq_length, - } - - train_file_range = FILE_RANGES[self._task_name]["train"] - dev_file_range = FILE_RANGES[self._task_name]["dev"] - test_file_range = FILE_RANGES[self._task_name]["test"] - - self._file_ranges = { - "train": train_file_range, - "dev": dev_file_range, - "test": test_file_range, - } - - self._seen_services = { - "train": set(), - "dev": set(), - "test": set(), - } - - self._tokenizer = tokenizer - - self._dialogues_example_dir = dialogues_example_dir - - self.dial_files = {} - - # slots_relation_list.np would contain the candidate list of slots for each (service, slot) which would be - # looked into when a switch between two services happens in the dialogue and we can not find any value for a slot in the current user utterance. - # This file would get generated from the dialogues in the training set. - self.slots_relation_file = os.path.join( - dialogues_example_dir, f"{self._task_name}_train_slots_relation_list.np" - ) - for dataset in ["train", "dev", "test"]: - # Process dialogue files - dial_file = f"{self._task_name}_{dataset}_examples.json" - dial_file = os.path.join(dialogues_example_dir, dial_file) - self.dial_files[(self._task_name, dataset)] = dial_file - - dialog_paths = DialogueSGDDataProcessor.get_dialogue_files(data_dir, dataset, self._task_name) - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - for dialog in dialogs: - self._seen_services[dataset].update(set(dialog['services'])) - - if is_global_rank_zero(): - overwrite_dial_files = not self.cfg.use_cache - self.save_dialog_examples(overwrite_dial_files=overwrite_dial_files) - - def save_dialog_examples(self, overwrite_dial_files: bool): - """ - Preprocesses dialogues and saves to disk. 
- Args: - overwrite_dial_files: whether or not to overwrite saved file if already exists - """ - for dataset in ["train", "dev", "test"]: - dial_file = self.dial_files[(self._task_name, dataset)] - if not os.path.exists(dial_file) or overwrite_dial_files: - logging.info(f"Start generating the dialogue examples for {dataset} dataset.") - if not os.path.exists(self._dialogues_example_dir): - os.makedirs(self._dialogues_example_dir) - dial_examples, slots_relation_list = self._generate_dialog_examples( - dataset, self.schemas, self._subsample - ) - - with open(dial_file, "w", encoding="UTF-8") as f: - json.dump([i.data for i in dial_examples], f) - - if dataset == "train": - with open(self.slots_relation_file, "wb") as f: - pickle.dump(slots_relation_list, f) - logging.info(f"The slot carry-over list for train set is stored at {self.slots_relation_file}") - - logging.info(f"The dialogue examples for {dataset} dataset saved at {dial_file}") - logging.info(f"Finish generating the dialogue examples for {dataset} dataset.") - - # common interface for Data Processor - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - def get_dialog_examples(self, dataset_split: str) -> List[object]: - """ - Loads preprocessed dialogue examples from disk. - Args: - dataset_split: dataset split - Returns: - dial_examples: list of InputExample's. - """ - if (self._task_name, dataset_split) not in self.dial_files or not os.path.exists( - self.dial_files[(self._task_name, dataset_split)] - ): - raise ValueError( - f"{dataset_split} dialogue examples were not processed for {self._task_name} task. Re-initialize SGDDataProcessor and add {dataset_split} dataset split to datasets arg." - ) - dial_file = self.dial_files[(self._task_name, dataset_split)] - logging.info(f"Loading dialogue examples from {dial_file}.") - - with open(dial_file, "rb") as f: - dial_examples = json.load(f) - dial_examples = [DialogueInputExample(i) for i in dial_examples] - if not os.path.exists(self.slots_relation_file): - raise ValueError( - f"Slots relation file {self.slots_relation_file} does not exist. It is needed for the carry-over mechanism of state tracker for switches between services." - ) - if os.path.getsize(self.slots_relation_file) > 0: - with open(self.slots_relation_file, "rb") as f: - self.schemas._slots_relation_list = pickle.load(f) - logging.info( - f"Loaded the slot relation list for value carry-over between services from {self.slots_relation_file}." - ) - - return dial_examples - - def get_seen_services(self, dataset_split: str): - """ - Returns list of seen services, i.e. both in given and training split - Args: - dataset_split: data split - Returns: - seen_services: list of seen services - """ - seen_services = self._seen_services[dataset_split] - return seen_services - - def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsample: bool): - """ - Returns a list of `InputExample`s of the data splits' dialogues. - Args: - dataset_split: data split, can be "train", "dev", or "test". 
- schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - logging.info(f'Creating examples and slot relation list from the dialogues started...') - dialog_paths = [ - os.path.join(self.data_dir, dataset_split, "dialogues_{:03d}.json".format(i)) - for i in self._file_ranges[dataset_split] - ] - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - - examples = [] - slot_carryover_candlist = collections.defaultdict(int) - for dialog_idx, dialog in enumerate(dialogs): - if dialog_idx % 1000 == 0: - logging.info(f'Processed {dialog_idx} dialogues.') - examples.extend( - self._create_examples_from_dialog(dialog, schemas, dataset_split, slot_carryover_candlist, subsample) - ) - - slots_relation_list = collections.defaultdict(list) - for slots_relation, relation_size in slot_carryover_candlist.items(): - if relation_size > 0: - slots_relation_list[(slots_relation[0], slots_relation[1])].append( - (slots_relation[2], slots_relation[3], relation_size) - ) - slots_relation_list[(slots_relation[2], slots_relation[3])].append( - (slots_relation[0], slots_relation[1], relation_size) - ) - - return examples, slots_relation_list - - def _create_examples_from_dialog( - self, dialog: dict, schemas: object, dataset_split: str, slot_carryover_candlist: dict, subsample: bool - ): - """ - Create examples for every turn in the dialogue. - Args: - dialog: dialogue example - schemas: schema for all services of all datasets - dataset_split: data split - slot_carryover_candlist: a dictionary to keep and count the number of carry-over cases between two slots from two different services - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - dialog_id = dialog["dialogue_id"] - prev_states = {} - examples = [] - for turn_idx, turn in enumerate(dialog["turns"]): - # Generate an example for every frame in every user turn. 
- if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - user_frames = {f["service"]: f for f in turn["frames"]} - if self.cfg.system_utterance == 'prev_turn': - if turn_idx > 0: - system_turn = dialog["turns"][turn_idx - 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - else: - system_utterance = "" - system_frames = {} - else: # takes the system utterance of the next turn - system_turn = dialog["turns"][turn_idx + 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - - turn_id = "{}-{}-{:02d}".format(dataset_split, dialog_id, turn_idx) - turn_examples, prev_states, slot_carryover_values = self._create_examples_from_turn( - turn_id, - system_utterance, - user_utterance, - system_frames, - user_frames, - prev_states, - schemas, - subsample, - ) - examples.extend(turn_examples) - - for value, slots_list in slot_carryover_values.items(): - if value in ["True", "False"]: - continue - if len(slots_list) > 1: - for service1, slot1 in slots_list: - for service2, slot2 in slots_list: - if service1 == service2: - continue - if service1 > service2: - service1, service2 = service2, service1 - slot1, slot2 = slot2, slot1 - slot_carryover_candlist[(service1, slot1, service2, slot2)] += 1 - return examples - - def _get_state_update(self, current_state: dict, prev_state: dict) -> dict: - """ - Updates dialogue state - Args: - current_state: slot values pairs for the current dialogue turn - prev_state: slot values pairs for the previous dialogue turns - Returns: - state_update: slot values pairs that are added/updated during the current dialogue turn - """ - state_update = dict(current_state) - for slot, values in current_state.items(): - if slot in prev_state and prev_state[slot][0] in values: - # Remove the slot from state if its value didn't change. - state_update.pop(slot) - return state_update - - @staticmethod - def convert_camelcase_to_lower(label): - """Converts camelcase to lowercase with spaces e.g. 'HelloWorld' --> 'hello world'""" - if label.lower() == "none": - return "none" - label = label.split("_")[0] - tokens = re.findall('[A-Z][^A-Z]*', label) - return ' '.join([token.lower() for token in tokens]) - - def preprocess_intent(self, intent, schemas, service): - if self.cfg.preprocess_intent_function == 'default': - return intent - elif self.cfg.preprocess_intent_function == 'lowercase': - return DialogueSGDDataProcessor.convert_camelcase_to_lower(intent) - elif self.cfg.preprocess_intent_function == 'description': - return schemas.get_service_schema(service).intent_descriptions[intent] - else: - raise ValueError( - 'Only default, lowercase and description are allowed for model.dataset.preprocess_intent_function for SGD task' - ) - - def _create_examples_from_turn( - self, - turn_id: int, - system_utterance: str, - user_utterance: str, - system_frames: dict, - user_frames: dict, - prev_states: dict, - schemas: object, - subsample: bool, - ): - """ - Creates an example for each frame in the user turn. 
- Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. {'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - system_user_utterance = system_utterance + ' ' + user_utterance - states = {} - - examples = [] - slot_carryover_values = collections.defaultdict(list) - - for service, user_frame in user_frames.items(): - - state = user_frame["state"]["slot_values"] - state_update = self._get_state_update(state, prev_states.get(service, {})) - states[service] = state - system_frame = system_frames.get(service, None) - dataset_split, dialog_id, turn_id_ = turn_id.split('-') - dialog_id_1, dialog_id_2 = dialog_id.split('_') - example_id = f"{turn_id}-{service}" - example_id_num = [ - int(dialog_id_1), - int(dialog_id_2), - int(turn_id_), - schemas.get_service_id(service), - ] - intent = user_frames[service]["state"]['active_intent'] - all_possible_slots = schemas.get_service_schema(service).slots - categorical_slots = schemas.get_service_schema(service).categorical_slots - one_example = { - "example_id": example_id, - "example_id_num": example_id_num, - "utterance": user_utterance, - "system_utterance": system_utterance, - "system_slots": ( - {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None - ), - "system_actions": system_frame["actions"] if system_frame is not None else None, - "labels": { - "service": service, - "intent": self.preprocess_intent(intent, schemas, service), - "slots": {slot: state[slot] for slot in state_update}, - }, - "label_positions": {"slots": {slot["slot"]: slot for slot in user_frames[service]["slots"]}}, - "possible_labels": { - "service": schemas.services, - "intent": [ - self.preprocess_intent(intent, schemas, service) - for intent in schemas.get_service_schema(service).intents - ], - "slots": { - slot: ( - schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] - ) - for slot in all_possible_slots - }, - }, - "description": { - "service": schemas.get_service_schema(service).description, - "intent": schemas.get_service_schema(service).intent_descriptions[intent], - "slots": { - slot: schemas.get_service_schema(service).slot_descriptions[slot] for slot in state_update - }, - }, - } - - examples.append(DialogueInputExample(one_example)) - - if service not in prev_states and int(turn_id_) > 0: - for slot_name, values in state_update.items(): - for value in values: - slot_carryover_values[value].append((service, slot_name)) - for prev_service, prev_slot_value_list in prev_states.items(): - if prev_service == service: - continue - if prev_service in state: - prev_slot_value_list = state[prev_service] - for prev_slot_name, prev_values in prev_slot_value_list.items(): - for prev_value in prev_values: - slot_carryover_values[prev_value].append((prev_service, prev_slot_name)) - - return examples, states, slot_carryover_values - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - 
Find indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for slot_span in char_slot_spans: - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries - - @classmethod - def load_dialogues(cls, dialog_json_filepaths: List[str]) -> List[dict]: - """ - Obtain the list of all dialogues from specified json files. - Args: - dialog_json_filepaths: list of json files - Returns: - dialogs: the list of all dialogues - """ - dialogs = [] - for dialog_json_filepath in sorted(dialog_json_filepaths): - with open(dialog_json_filepath, 'r', encoding="UTF-8") as f: - dialogs.extend(json.load(f)) - f.close() - return dialogs - - @classmethod - def get_dialogue_files(cls, data_dir: str, dataset_split: str, task_name: str): - """ - Obtain the list of all dialogue json files - Args: - data_dir: path to the data folder - dataset_split: data split - task_name: SGD task name, see keys of the FILE_RANGES - Returns: - dialog: the list of all dialogue json files paths - """ - return [ - os.path.join(data_dir, dataset_split, 'dialogues_{:03d}.json'.format(fid)) - for fid in FILE_RANGES[task_name][dataset_split] - ] diff --git a/nemo/collections/nlp/data/dialogue/dataset/__init__.py b/nemo/collections/nlp/data/dialogue/dataset/__init__.py deleted file mode 100644 index 3352c7be2d9b..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
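# Illustration only: a self-contained sketch of the per-turn state-update rule implemented by
# DialogueSGDDataProcessor._get_state_update above (a slot is dropped from the update when its
# first previous value is still present in the current state). Not part of the NeMo code in this
# diff; the slot values below are example data.
def state_update(current_state, prev_state):
    update = dict(current_state)
    for slot, values in current_state.items():
        if slot in prev_state and prev_state[slot][0] in values:
            update.pop(slot)  # value unchanged since the last turn, so it is not a new update
    return update

prev_turn_state = {"city": ["San Jose"]}
current_turn_state = {"city": ["San Jose"], "cuisine": ["American"]}
print(state_update(current_turn_state, prev_turn_state))
# -> {'cuisine': ['American']}: only the newly introduced slot survives in the update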
- -from nemo.collections.nlp.data.dialogue.dataset.dialogue_bert_dataset import DialogueBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_zero_shot_intent_dataset import DialogueZeroShotIntentDataset diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py deleted file mode 100644 index 33d46c308e81..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] - - -class DialogueBERTDataset(DialogueDataset): - """ - Creates a dataset to use for the task of joint intent - and slot classification with pretrained model. - - For a dataset to use during inference without labels, see - IntentSlotDataset. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer - cfg: config container for dataset - """ - # deprecation warning - deprecated_warning("DialogueBERTDataset") - - self.cfg = cfg - self.all_possible_labels = dialogues_processor.intents - self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} - self.all_possible_slots = dialogues_processor.slots - self.slot_name_to_slot_id = {self.all_possible_slots[i]: i for i in range(len(self.all_possible_slots))} - self.empty_slot_name = 'O' - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.features if self.cfg.num_samples == -1 else self.features[: self.cfg.num_samples] - - queries = [feature.data["utterance"] for feature in self.features] - if self.cfg.do_lowercase: - queries = [query.lower() for query in queries] - intents = [self.label_to_label_id[feature.data["labels"]["intent"]] for feature in self.features] - word_level_slots = [self.convert_slot_position_to_slot_ids(feature.data) for feature in self.features] - - features = DialogueBERTDataset.get_features( - queries, - self.cfg.max_seq_length, - tokenizer, - pad_label=self.cfg.pad_label, - word_level_slots=word_level_slots, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = intents - - def convert_slot_position_to_slot_ids(self, feature): - slot_ids = [self.slot_name_to_slot_id[self.empty_slot_name] for i in range(len(feature["utterance"].split()))] - slot_name_to_positions = feature["label_positions"]["slots"] - - for slot_name in slot_name_to_positions: - slot_id = self.slot_name_to_slot_id[slot_name] - start = slot_name_to_positions[slot_name]["start"] - exclusive_end = slot_name_to_positions[slot_name]["exclusive_end"] - for to_replace_position in range(start, min(exclusive_end, len(slot_ids))): - slot_ids[to_replace_position] = slot_id - - return slot_ids - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - @staticmethod - def truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ): - - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if 
len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - return ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - @staticmethod - def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - word_level_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, - ): - """ - Convert queries (utterance, intent label and slot labels) to BERT input format - """ - - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = word_level_slots is not None - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - # mask all sub-word tokens except the first token in a word - # use the label for the first sub-word token as the label for the entire word to eliminate need for disambiguation - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([word_level_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - - # truncate and pad samples - ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) = DialogueBERTDataset.truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - 
all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - # log examples for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - # deprecation warning - deprecated_warning("DialogueIntentSlotInferenceDataset") - - if do_lower_case: - queries = [query.lower() for query in queries] - - features = DialogueBERTDataset.get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py deleted file mode 100644 index 5540dd3b19f7..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
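
Reviewer note: the DialogueBERTDataset.get_features logic removed above expands word-level slot labels onto sub-word tokens and computes loss only on the first piece of each word. A minimal, self-contained sketch of that pattern follows; `toy_tokenize` and the example slot ids are illustrative stand-ins for the real tokenizer and label vocabulary, not the removed implementation itself.

```python
# Sketch of the word-to-subword expansion used by the removed
# DialogueBERTDataset.get_features. toy_tokenize stands in for
# tokenizer.text_to_tokens and is purely illustrative.
from typing import List, Tuple


def toy_tokenize(word: str) -> List[str]:
    # Pretend every word longer than 4 characters splits into two word pieces.
    return [word] if len(word) <= 4 else [word[:4], "##" + word[4:]]


def build_features(
    words: List[str],
    word_slots: List[int],
    pad_label: int = 128,
    ignore_start_end: bool = False,
    ignore_extra_tokens: bool = False,
) -> Tuple[List[str], List[int], List[int], List[int]]:
    subtokens = ["[CLS]"]
    loss_mask = [1 - ignore_start_end]   # optionally skip loss on [CLS]/[SEP]
    subtokens_mask = [0]                 # 1 marks the first piece of each word
    slots = [pad_label]
    for word, slot in zip(words, word_slots):
        pieces = toy_tokenize(word) or ["[UNK]"]
        subtokens.extend(pieces)
        # loss is always computed on the first piece; trailing pieces are optional
        loss_mask.append(1)
        loss_mask.extend([int(not ignore_extra_tokens)] * (len(pieces) - 1))
        subtokens_mask.append(1)
        subtokens_mask.extend([0] * (len(pieces) - 1))
        # every piece of a word inherits the word-level slot label
        slots.extend([slot] * len(pieces))
    subtokens.append("[SEP]")
    loss_mask.append(1 - ignore_start_end)
    subtokens_mask.append(0)
    slots.append(pad_label)
    return subtokens, loss_mask, subtokens_mask, slots


if __name__ == "__main__":
    print(build_features(["wake", "me", "tomorrow"], [54, 54, 18]))
```
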
- -from nemo.core.classes import Dataset - -__all__ = ['DialogueDataset'] - - -class DialogueDataset(Dataset): - ''' - Base class for Dialogue Datasets - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. intent classification, slot filling or sequence generation etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, **kwargs): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError - - def __getitem__(self, idx: int): - raise NotImplementedError diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py deleted file mode 100644 index f89a5013c2ae..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import random -from collections import defaultdict - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - - -class DialogueGPTClassificationDataset(DialogueDataset): - ''' - Designed for classification tasks such as intent/domain classification as well as slot tagging - - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueGPTClassificationDataset") - - self.cfg = cfg - - if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": - raise ValueError( - "slot-filling is not supported by eval_mode {}, please set model.dataset.eval_mode=generation instead".format( - self.cfg.eval_mode - ) - ) - if self.cfg.target_template != "with_slots" and self.cfg.field == "slots": - raise ValueError("please set model.dataset.target_template='with_slots' if model.dataset.field='slots'") - self.label_type = self.cfg.field - if self.cfg.target_template == "with_description": - self.label_to_description = defaultdict(str) - self.all_possible_labels = set() - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - self.max_candidates = 2 - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - self.features = dialogues_processor.get_dialog_examples(dataset_split) - for idx in range(len(self.features)): - self.preprocess_feature(idx) - if self.cfg.debug_mode: - self.features = self.features[:16] - # for few shot learning to append in the prompt - self.lm_features = self.get_lm_samples() - - def transform(self, label): - """ - Normalize labels by replacing underscore with space - - Args: - label: str - Returns: - normalized_label: str - """ - if self.cfg.task == "assistant" and self.cfg.prompt_template != "prompt_tuning": - label = label.replace('_', ' ') - return label - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def preprocess_feature(self, idx): - ex = self.features[idx].data - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - if self.label_type in ["service", "intent"]: - label = self.transform(label) - candidates = [self.transform(candidate) for candidate in candidates] - - self.features[idx].data["labels"][self.label_type] = label - self.features[idx].data["possible_labels"][self.label_type] = candidates - if self.cfg.target_template == "with_description": - description = ex["description"][self.label_type] - self.label_to_description[label] = description - for candidate in candidates: - self.all_possible_labels.add(candidate) - self.max_candidates = max(self.max_candidates, len(candidates)) - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - @staticmethod - def linearize_slots(slots): - """ - Serialize slots into a linear text - - Args: - slots: dict with each slot_name as key and possible slot values as value - Returns: - linear_slots: text based representation of slot names and values - """ - if not slots: - return 
"None" - return ", ".join( - ["{}({})".format(slot, value if isinstance(value, str) else value[0]) for slot, value in slots.items()] - ) - - def format_target(self, target, slots=None): - """ - Formats the back part of the training example, after the base_template - for instance, "restaurant" in " service: restaurant" - or "set alarm\nslots: (), ()" in \ - "\nintent: set alarm\nslots: (), ()" - """ - if self.cfg.target_template == "with_description": - return target + ' (' + self.label_to_description[target] + ')' - elif self.cfg.target_template == "default": - return target - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "intent": - return target + '\nslots: ' + DialogueGPTClassificationDataset.linearize_slots(slots) - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "slots": - return DialogueGPTClassificationDataset.linearize_slots(slots) - else: - raise ValueError("Please choose a target format from {default, with_description, with_slots}") - - def get_lm_samples(self): - max_sample_length = 0 - lm_features = [] - for idx in range(len(self.features)): - ex = self.features[idx].data - utterance = ex["utterance"] - label = ex["labels"][self.label_type] - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - lm_feature = self.format_prompt(utterance) + ' ' + self.format_target(label, slots=slots) - feature_len = self.get_n_tokens_in_sentence(lm_feature) - max_sample_length = max(max_sample_length, feature_len) - lm_features.append(lm_feature) - logging.info("max feature length per sample with label: ".format(max_sample_length)) - logging.info( - "please adjust max seq len to at least {} * ({} + 1) = {} but not too much more for efficiency".format( - max_sample_length, self.cfg.few_shot, max_sample_length * (1 + self.cfg.few_shot) - ) - ) - return lm_features - - def format_prompt(self, utterance, few_shot=0, idx=None): - if self.cfg.prompt_template == "default": - base_template = utterance + ' ' + self.label_type + ':' - elif self.cfg.prompt_template == "i_want_to": - base_template = utterance + ' ' + 'I want to' - elif self.cfg.prompt_template == "prompt_tuning": - base_template = utterance + '\n' + self.label_type + ':' - elif self.cfg.prompt_template == "prompt_tuning_with_options": - base_template = ( - 'possible intents: ' - + ', '.join(sorted(list(self.all_possible_labels))) - + '\n\n' - + utterance - + '\n' - + self.label_type - + ':' - ) - - if few_shot > 0: - few_shot_indices = random.sample(range(len(self.features)), few_shot + 1) - few_shot_indices = [i for i in few_shot_indices if i != idx][:few_shot] - few_shot_samples = [self.lm_features[i] for i in few_shot_indices] - base_template = ( - self.tokenizer.tokenizer.pad_token.join(few_shot_samples) - + self.tokenizer.tokenizer.pad_token - + base_template - ) - return base_template - - def collate_fn(self, batch): - """ - Truncates elements to max length in batch - """ - _, _, _, _, candidate_attn_masks, _, _, _ = zip(*batch) - # determine max length in batch - batch_max_length = 0 - for candidate_attn_mask in candidate_attn_masks: - for one_attn_mask in candidate_attn_mask: - batch_max_length = max(batch_max_length, torch.sum(one_attn_mask).item()) - # padding for tp=2 situation - if batch_max_length % 2: - batch_max_length += 1 - - all_items = [] - for item in zip(*batch): - if isinstance(item[0], int): - item = [torch.tensor(i) for i in item] - item_stack = torch.stack(item) - # if item_stack is 1d, elements 
refers to indexes and there is no need to truncate - if len(item_stack.size()) == 1: - all_items.append(item_stack) - # otherwise, truncate last dimension to max length in batch - else: - all_items.append(item_stack[..., :batch_max_length]) - return all_items - - def __getitem__(self, idx: int): - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. service: restaurant - e.g. service: restaurant - e.g. \nintent: set alarm\nslots: (), () - - Generation example: - e.g. service: - - ''' - ex = self.features[idx].data - - utterance = ex["utterance"] - utterance_length = self.get_n_tokens_in_sentence(utterance) - - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - - base_template = self.format_prompt(utterance, few_shot=self.cfg.few_shot, idx=idx) - - sentence_without_answer = base_template - - sentence = base_template + ' ' + self.format_target(label, slots=slots) - - if self.cfg.eval_mode == "binary_score": - candidate_sentences = [] - for candidate in candidates: - positive_answer = base_template + ' ' + candidate + ' Answer: ' + 'yes' - negative_answer = base_template + ' ' + candidate + ' Answer: ' + 'no' - if candidate == label: - correct_candidate = len(candidate_sentences) // 2 - candidate_sentences.append(positive_answer) - candidate_sentences.append(negative_answer) - else: - candidate_sentences.append(negative_answer) - candidate_sentences.append(positive_answer) - else: - correct_candidate = 0 - candidate_sentences = [ - base_template + ' ' + self.format_target(candidate, slots=slots) for candidate in candidates - ] - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - candidate_tokenized_sentences = [ - self.default_encode(candidate_sentence) for candidate_sentence in candidate_sentences - ] - - # ensure all samples have the same number of candidates for collating into tensor - while len(candidate_tokenized_sentences) < self.max_candidates: - candidate_tokenized_sentences.append(candidate_tokenized_sentences[0]) - - candidate_input_ids = torch.stack([i[1] for i in candidate_tokenized_sentences]) - candidate_attn_masks = torch.stack([i[2] for i in candidate_tokenized_sentences]) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - training_mask_end, - utterance_length, - correct_candidate, - ) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py deleted file mode 100644 index 8ddbc2e3925e..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - - -class DialogueGPTGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueGPTGenerationDataset") - - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - def remove_invalid_samples(self, features): - valid_idxs = [] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - for i in range(len(features)): - features[i].data["labels"]["utterance"] = features[i].data["utterance"] - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field] or not features[i].data["labels"][field].strip(): - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - ex["labels"]["utterance"] = ex["utterance"] - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - ''' - For each example, this function determines the format of input and output sequences based on user-specified conguration. 
- This is controlled by model.dataset.input_field and model.dataset.output_field - For instance: - If model.dataset.input_field == response and model.dataset.output_field == fluent_response: - Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) - If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) - If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) - ''' - ex = self.features[idx].data - - input_sentence = self.format_prompt(ex) - - utterance_length = self.get_n_tokens_in_sentence(input_sentence) - - output_sentence = ex["labels"][self.output_label_type] - - base_template = input_sentence - - sentence_without_answer = base_template + ' ' + self.output_label_type + ':' - - sentence = sentence_without_answer + ' ' + output_sentence - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return (input_ids, attn_masks, labels, training_mask_end, utterance_length) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py deleted file mode 100644 index dc123ca0e3d7..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueNearestNeighbourDataset'] - - -class DialogueNearestNeighbourDataset(DialogueDataset): - """ - Dataset for training a Nearest Neighbour model for zero shot intent recognition. 
- """ - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - """ - # deprecation warning - deprecated_warning("DialogueNearestNeighbourDataset") - - self.cfg = cfg - self.tokenizer = tokenizer - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.max_n = self.find_max_n_candidates() - self.examples = self._create_examples(self.raw_features) - - def find_max_n_candidates(self): - max_n = 0 - for idx in range(len(self.raw_features)): - ex = self.raw_features[idx].data - n = len(ex["possible_labels"]["intent"]) - max_n = max(max_n, n) - return max_n - - def _create_examples(self, raw_features): - """Creates examples for the training and dev sets.""" - examples = [] - seen_utterances = set() - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - if user_utterance in seen_utterances: - continue - seen_utterances.add(user_utterance) - intent = ex["labels"]["intent"] - sentences = [user_utterance] - labels = [-1] - for candidate_intent in ex["possible_labels"]["intent"]: - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - labels.append(label) - sentences.append(text_b) - - while self.max_n > len(labels) - 1: - labels.append(label) - sentences.append(text_b) - - encoded_input = self.tokenizer.tokenizer( - sentences, - padding='max_length', - truncation=True, - return_tensors='pt', - max_length=self.cfg.max_seq_length, - ) - examples.append((encoded_input['input_ids'], encoded_input['attention_mask'], torch.tensor(labels))) - return examples - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - return self.examples[idx] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py deleted file mode 100644 index df522b74e861..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - - -class DialogueS2SGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueS2SGenerationDataset") - - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - @staticmethod - def format_actions(prompt_template, actions): - """ - Formats actions based on prompt_template - - Args: - prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions - actions: list of actions, each a dict containing keys 'act', 'slot' and 'values' with their corresponding values as their attribute-values - - Returns: - formatted_actions: string representations of actions, formatted based on the fields needed. - """ - actions_str = [] - for action in actions: - act = action['act'].lower() - slot = action['slot'] - value = action['values'][0] if action['values'] else '' - - if prompt_template == 'values': - action_str = value - elif prompt_template == 'slots_values': - if value: - action_str = '{} ({})'.format(slot, value) - else: - action_str = slot - elif prompt_template == 'acts_slots_values': - if value: - action_str = '{} {} ({})'.format(act, slot, value) - elif slot: - action_str = '{} {}'.format(act, slot) - else: - action_str = act - else: - raise ValueError( - "Please set model.dataset.prompt_template to acts_slots_values, slots_values or values" - ) - actions_str.append(action_str) - return ' '.join(actions_str) - - def remove_invalid_samples(self, features): - valid_idxs = [] - for i in range(len(features)): - for field in ['utterance', 'system_utterance', 'system_actions']: - if field in features[i].data: - features[i].data["labels"][field] = features[i].data[field] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field]: - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def 
format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response - e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response - e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response - ''' - ex = self.features[idx].data - for field in ['utterance', 'system_utterance']: - if field in ex: - ex["labels"][field] = ex[field] - - if 'system_actions' in ex: - ex["labels"]['system_actions'] = DialogueS2SGenerationDataset.format_actions( - self.cfg.prompt_template, ex['system_actions'] - ) - - input_sentence = self.format_prompt(ex) - output_sentence = ex["labels"][self.output_label_type] - - _, input_ids, attn_masks = self.default_encode(input_sentence) - - _, labels, _ = self.default_encode(output_sentence) - - labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 - - return input_ids, attn_masks, labels diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py deleted file mode 100644 index fcab5e91329f..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst -""" - -import os -import re -from typing import List - -import numpy as np - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import SGDInputExample - -__all__ = ['DialogueSGDBERTDataset'] - - -class DialogueSGDBERTDataset(DialogueDataset): - ''' - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, schemas, schema_config, cfg): - """ Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - schemas: SGD schema for domain, intent and slots - schema_config: config dict for schemas - cfg: cfg container for dataset - """ - self.dataset_split = dataset_split - self.tokenizer = tokenizer - self.schemas = schemas - self.schema_config = schema_config - self.dialogues_processor = dialogues_processor - self.cfg = cfg - self.subsample = self.dialogues_processor._subsample - - dial_file = f"{dialogues_processor._task_name}_{dataset_split}_examples_bert.processed" - self.dial_file = os.path.join(self.cfg.data_dir, dial_file) - if self.cfg.use_cache and os.path.exists(self.dial_file): - self.load_features() - else: - self.process_features() - self.save_features() - - def load_features(self): - with open(self.dial_file, "rb") as f: - self.features = np.load(f, allow_pickle=True) - - def process_features(self): - self.features = [] - self.raw_features = self.dialogues_processor.get_dialog_examples(self.dataset_split) - for idx in range(len(self.raw_features)): - self.bert_process_one_sample(idx) - - def save_features(self): - with open(self.dial_file, "wb") as f: - np.save(f, self.features) - - def _tokenize(self, utterance: str): - """ - Tokenize the utterance - - Args: - utterance: A string containing the utterance to be tokenized. - - Returns: - bert_tokens: A list of tokens obtained by word-piece tokenization of the - utterance. - alignments: A dict mapping indices of characters corresponding to start - and end positions of words (not subwords) to corresponding indices in - bert_tokens list. - inverse_alignments: A list of size equal to bert_tokens. Each element is a - tuple containing the index of the starting and inclusive ending - character of the word corresponding to the subword. This list is used - during inference to map word-piece indices to spans in the original - utterance. - """ - # utterance = tokenization.convert_to_unicode(utterance) - - # After _naive_tokenize, spaces and punctuation marks are all retained, i.e. - # direct concatenation of all the tokens in the sequence will be the - # original string. - tokens = DialogueSGDBERTDataset._naive_tokenize(utterance) - # ['I', ' ', 'am', ' ', 'feeling', ' ', 'hungry', ' ', 'so', ' ', 'I', ' ', 'would', ' ', 'like', ' ', 'to', ' ', 'find', ' ', 'a', ' ', 'place', ' ', 'to', ' ', 'eat', '.'] - # Filter out empty tokens and obtain aligned character index for each token. - alignments = {} - char_index = 0 - bert_tokens = ( - [] - ) # ['I', 'am', 'feeling', 'hungry', 'so', 'I', 'would', 'like', 'to', 'find', 'a', 'place', 'to', 'eat', '.'] - # These lists store inverse alignments to be used during inference. - bert_tokens_start_chars = [] - bert_tokens_end_chars = [] - for token in tokens: - if token.strip(): - subwords = self.tokenizer.text_to_tokens(token) - # Store the alignment for the index of starting character and the - # inclusive ending character of the token. - alignments[char_index] = len(bert_tokens) - bert_tokens_start_chars.extend([char_index] * len(subwords)) - bert_tokens.extend(subwords) - # The inclusive ending character index corresponding to the word. 
- inclusive_char_end = char_index + len(token) - 1 - alignments[inclusive_char_end] = len(bert_tokens) - 1 - bert_tokens_end_chars.extend([inclusive_char_end] * len(subwords)) - char_index += len(token) - inverse_alignments = list(zip(bert_tokens_start_chars, bert_tokens_end_chars)) - return bert_tokens, alignments, inverse_alignments - - @classmethod - def _naive_tokenize(cls, s: str): - """ - Tokenizes a string, separating words, spaces and punctuations. - Args: - s: a string - Returns: - seq_tok: list of words, spaces and punctuations from the string - """ - # Spaces and punctuation marks are all retained, i.e. direct concatenation - # of all the tokens in the sequence will be the original string. - seq_tok = [tok for tok in re.split(r"([^a-zA-Z0-9])", s) if tok] - return seq_tok - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx: int): - ex = self.features[idx] - - return ( - np.array(ex.example_id_num), - np.array(ex.example_id_num[-1]), # service_id - np.array(ex.utterance_ids), - np.array(ex.utterance_segment), - np.array(ex.utterance_mask, dtype=np.longlong), - np.array(ex.intent_status, dtype=np.float32), - np.array(ex.requested_slot_status, dtype=np.float32), - np.array(ex.categorical_slot_status), - np.array(ex.categorical_slot_value_status, dtype=np.float32), - np.array(ex.noncategorical_slot_status), - np.array(ex.noncategorical_slot_value_start), - np.array(ex.noncategorical_slot_value_end), - np.array(ex.start_char_idx), # noncat_alignment_start - np.array(ex.end_char_idx), # noncat_alignment_end - np.array(ex.task_mask), # noncat_alignment_end - ) - - def bert_process_one_sample(self, idx): - """ - Creates an example for each frame in the user turn. - Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. 
{'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - - ex = self.raw_features[idx].data - example_id_num = ex["example_id_num"] - example_id = ex["example_id"] - user_utterance = ex["utterance"] - system_utterance = ex["system_utterance"] - service = ex["labels"]["service"] - schemas = self.schemas - state_update = ex["labels"]["slots"] - system_slots = ex["system_slots"] - - user_tokens, user_alignments, user_inv_alignments = self._tokenize(user_utterance) - system_tokens, system_alignments, system_inv_alignments = self._tokenize(system_utterance) - system_user_utterance = system_utterance + ' ' + user_utterance - system_user_tokens, system_user_alignments, system_user_inv_alignments = self._tokenize(system_user_utterance) - examples = [] - - base_example = SGDInputExample(schema_config=self.schema_config, tokenizer=self.tokenizer) - base_example.service_schema = self.schemas.get_service_schema(service) - base_example.service_id = example_id_num[-1] - - base_example.example_id = example_id - base_example.example_id_num = example_id_num - - for model_task in range(self.schema_config["NUM_TASKS"]): - if model_task == 0: - for intent_id, intent in enumerate(schemas.get_service_schema(service).intents): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.intent_id = intent_id - task_example.example_id += f"-{model_task}-{intent_id}-0" - task_example.example_id_num.extend([model_task, intent_id, 0]) - intent_description = ( - intent + " " + self.schemas.get_service_schema(service).intent_descriptions[intent] - ) - intent_tokens, intent_alignments, intent_inv_alignments = self._tokenize(intent_description) - task_example.add_utterance_features( - intent_tokens, - intent_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - intent_description, - system_user_utterance, - ) - - task_example.add_intents(ex) - examples.append(task_example) - - if model_task == 1: - for slot_id, slot in enumerate(schemas.get_service_schema(service).slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.requested_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + self.schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - user_tokens, - user_inv_alignments, - slot_description, - user_utterance, - ) - - task_example.add_requested_slots(ex) - examples.append(task_example) - - if model_task == 2: - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - - # assert task_example.task_mask == [0, 0, 1, 0, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - 
task_example.add_categorical_slots(state_update) - - if task_example.categorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - old_example = task_example - - for value_id, value in enumerate( - schemas.get_service_schema(service).get_categorical_slot_values(slot) - ): - if self.dataset_split != 'train' or task_example.categorical_slot_status == 1: - task_example = old_example.make_copy_of_categorical_features() - task_example.task_mask[3] = 1 - # assert task_example.task_mask == [0, 0, 0, 1, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.categorical_slot_value_id = value_id - task_example.example_id = base_example.example_id + f"-3-{slot_id}-{value_id}" - task_example.example_id_num = base_example.example_id_num + [3, slot_id, value_id] - slot_description = slot + " " + value # add slot description - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - task_example.add_categorical_slots(state_update) - assert task_example.categorical_slot_status == old_example.categorical_slot_status - examples.append(task_example) - - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - if model_task == 4: # noncat slot status - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).non_categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 1, 0] - task_example.noncategorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - - user_span_boundaries = self._find_subword_indices( - state_update, - user_utterance, - ex["label_positions"]["slots"], - user_alignments, - user_tokens, - 2 + len(slot_tokens) + len(system_tokens), - ) - - if system_slots is not None: - system_span_boundaries = self._find_subword_indices( - state_update, - system_utterance, - system_slots, - system_alignments, - system_tokens, - 2 + len(slot_tokens), - ) - else: - system_span_boundaries = {} - - task_example.add_noncategorical_slots(state_update, user_span_boundaries, system_span_boundaries) - if task_example.noncategorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - - if self.dataset_split != 'train' or task_example.noncategorical_slot_status == 1: - task_example = task_example.make_copy_of_non_categorical_features() - task_example.task_mask[5] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 0, 1] - task_example.example_id = base_example.example_id + f"-5-{slot_id}-0" - task_example.example_id_num = base_example.example_id_num + [5, slot_id, 0] - examples.append(task_example) 
- - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - for example in examples: - self.features.append(example) - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - Find indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for key, slot_span in char_slot_spans.items(): - # print(key, slot, slot_span, char_slot_spans) - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py deleted file mode 100644 index c1308238bea1..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict, List, Optional, Union - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueZeroShotIntentDataset'] - - -class DialogueZeroShotIntentDataset(GLUEDataset): - """ - Dataset for training a NLI model for zero shot intent recognition. Similar to GLUE/MNLI - dataset, but allows the user to specify which columns in the data files contain the - premise, hypothesis, and gold label. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - cfg: config dict for dataset - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - """ - # deprecation warning - deprecated_warning("DialogueZeroShotIntentDataset") - - self.cfg = cfg - self.tokenizer = tokenizer - if self.cfg.num_classes not in [2, 3]: - raise ValueError("num_classes must be either 2 or 3!") - self.label_list = ( - ["contradiction", "entailment", "neutral"] - if self.cfg.num_classes == 3 - else ['not_entailment', 'entailment'] - ) - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': ( - tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None - ), - } - - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.examples = self._create_examples(self.raw_features, dataset_split) - self.features = self.convert_examples_to_features( - self.examples, - [0, 1, 2, 3], - self.cfg.max_seq_length, - tokenizer, - output_mode="classification", - **token_params, - ) - - def _create_examples(self, raw_features, dataset_split: str): - """Creates examples for the training and dev sets.""" - examples = [] - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - intent = ex["labels"]["intent"] - for candidate_idx, candidate_intent in enumerate(ex["possible_labels"]["intent"]): - guid = "{}-{}-{}".format(dataset_split, idx, candidate_idx) - text_a = user_utterance - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . 
[SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . [SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - if hasattr(tokenizer, 'text_to_tokens'): - tokens_a = tokenizer.text_to_tokens(example.text_a) - else: - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - if hasattr(tokenizer, 'text_to_tokens'): - tokens_b = tokenizer.text_to_tokens(example.text_b) - else: - tokens_b = tokenizer.tokenize(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - if hasattr(tokenizer, 'tokens_to_ids'): - input_ids = tokenizer.tokens_to_ids(tokens) - else: - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - - if hasattr(tokenizer, 'tokens_to_ids'): - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - else: - pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] - - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - - return features - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/dialogue/input_example/__init__.py b/nemo/collections/nlp/data/dialogue/input_example/__init__.py deleted file mode 100644 index de4cf417e58c..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
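
Reviewer note: the DialogueZeroShotIntentDataset removed above casts intent recognition as NLI, pairing the utterance with every candidate intent rendered through a prompt template and labelling the pair as entailment only when it matches the gold intent. A small sketch of that pairing step follows; the prompt template string is illustrative, the real one comes from `cfg.prompt_template`.

```python
# Sketch of the premise/hypothesis pairing built by the removed
# DialogueZeroShotIntentDataset._create_examples.
from dataclasses import dataclass
from typing import List


@dataclass
class NLIPair:
    premise: str      # user utterance
    hypothesis: str   # prompt template + candidate intent
    label: int        # 1 = entailment (correct intent), 0 = not entailment


def make_pairs(
    utterance: str,
    intent: str,
    candidates: List[str],
    prompt_template: str = "the intent of the user is",
) -> List[NLIPair]:
    return [
        NLIPair(
            premise=utterance,
            hypothesis=f"{prompt_template} {candidate}",
            label=1 if candidate == intent else 0,
        )
        for candidate in candidates
    ]


if __name__ == "__main__":
    for pair in make_pairs(
        "please set an alarm for 7 am",
        intent="set alarm",
        candidates=["set alarm", "play music", "check weather"],
    ):
        print(pair)
```
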
- -from nemo.collections.nlp.data.dialogue.input_example.assistant_input_example import DialogueAssistantInputExample -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import DialogueSGDInputExample, SGDInputExample diff --git a/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py deleted file mode 100644 index c5574e8fa103..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueAssistantInputExample(DialogueInputExample): - """ - Template for DialogueAssistantInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , - "intent": , - "slots": { - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - # note for the Assistant dataset, start and end are word positions rather than char position - # these are whitespace-delimited word positions rather than tokenization-specific sub-word tokens. - "exclusive_end": 3, - "slot": "restaurant_name", - "start": 1 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - # all slots for categorical variables - # empty list for extractive slots - # Assistant only support extractive slots - "": [], - "": [], - } - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py deleted file mode 100644 index 80f3152cd82e..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
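Because the Assistant examples record slot positions as whitespace-delimited word indices with an exclusive end (rather than character offsets), recovering the surface value is a plain list slice. The utterance and span below are invented for illustration.

# Illustrative only: utterance and span values are hypothetical.
utterance = "book the hard rock cafe for seven pm"
span = {"slot": "restaurant_name", "start": 2, "exclusive_end": 5}

words = utterance.split()
slot_value = " ".join(words[span["start"]:span["exclusive_end"]])
print(slot_value)  # -> "hard rock cafe"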
- -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueDesignInputExample(DialogueInputExample): - """ - Template for DialogueDesignInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "system_utterance": , - "labels": { - "service": , - "intent": , - "slots": { - : '', - : '', - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': [, , ...], - "service": [, , ...], - "slots": { - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - "": "", - "": "", - } - }, - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/input_example.py b/nemo/collections/nlp/data/dialogue/input_example/input_example.py deleted file mode 100644 index 4920c2927f46..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/input_example.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ['DialogueInputExample'] - - -class DialogueInputExample(object): - """ - Generic Dialogue Input Example - Uses data: dict as a flexible interface to support various input types. - This ranges from classification labels, to complex nested labels such as those in SGD - - { - "utterance": , - "labels": { - "intent": , - "slots": { ... }, - } - } - """ - - def __init__(self, data: dict): - self.data = data - - def __repr__(self): - return self.data - - def __str__(self): - return self.data diff --git a/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py deleted file mode 100644 index e6576d40460b..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
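Each dataset-specific template above only documents a shape for the data dict; instantiation always goes through the base class. A small sketch with invented labels, assuming the DialogueInputExample class shown above is in scope; note that, as written, __repr__ and __str__ return the dict itself, so wrapping it in str(...) would be needed for them to yield actual strings.

# Hypothetical example; the utterance, intent and slot values are made up.
example = DialogueInputExample({
    "utterance": "set an alarm for 7 am",
    "labels": {"intent": "alarm_set", "slots": {"alarm_time": ["7 am"]}},
})
print(example.data["labels"]["intent"])  # -> "alarm_set"
# print(example) would raise as written, because __str__ returns a dict;
# returning str(self.data) instead keeps __repr__/__str__ string-valued.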
- -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class MellonQAInputExample(DialogueInputExample): - """ - Template for MellonQAInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "labels": { - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py deleted file mode 100644 index ded84d3ece67..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueMSMarcoInputExample(DialogueInputExample): - """ - Template for DialogueMSMarcoInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , # this is the domain - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - }, - "possible_labels": { - "service": [, , ...], - "passage": [, , ...], - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py deleted file mode 100644 index 9862a07baccd..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" - -from typing import List - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils import logging - -__all__ = [ - 'SGDInputExample', - 'STR_DONTCARE', - 'STATUS_OFF', - 'STATUS_ACTIVE', - 'STATUS_DONTCARE', -] - - -class DialogueSGDInputExample(DialogueInputExample): - - """ - Template for DialogueSGDInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "example_id": , - "example_id_num": , - "utterance": , - "system_utterance": , - "system_slots": None or { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - "system_actions": None or [{ - "act": "INFORM", - "canonical_values": [ - "2019-03-02" - ], - "slot": "date", - "values": [ - "March 2nd" - ] - }, ...] - "labels": { - "service": , - "intent": , - "slots": { - #only non-empty slots - #most slot values are list of length 1 - #but there are some of length 2 as both are accepted - #e.g. 1930 and 7:30 pm - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - #all slots including empty - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - #only non-empty slots - "": , - "": , - } - } - } - - """ - - -STR_DONTCARE = "dontcare" - -# These are used to represent the status of slots (off, active, dontcare) and -# intents (off, active) in dialogue state tracking. -STATUS_OFF = 0 -STATUS_ACTIVE = 1 -STATUS_DONTCARE = 2 - - -class SGDInputExample(object): - """An example for training/inference.""" - - def __init__( - self, - schema_config: dict, - tokenizer: object, - service_schema: object = None, - example_id: str = "NONE", - example_id_num: List[int] = [], - ): - """ - Constructs an InputExample. - Args: - schema_config: configuration - tokenizer: tokenizer object - service_schema: A ServiceSchema object wrapping the schema for the service - corresponding to this example. - example_id: Unique identifier for the example, like: 'train-1_00000-00-Restaurants_1' - example_id_num: dialogue_id and turn_id combined and service id combined into a list of ints, - like: [1, 0, 0, 18] - """ - self.schema_config = schema_config - self.service_schema = service_schema - self.service_id = None - if service_schema: - self.service_id = service_schema.service_id - self.example_id = example_id - self.example_id_num = example_id_num - self._max_seq_length = schema_config["MAX_SEQ_LENGTH"] - self._tokenizer = tokenizer - if self._tokenizer is None: - raise ValueError("Must specify tokenizer") - - self.user_utterance = '' - self.system_utterance = '' - # The id of each subword in the vocabulary for BERT. - self.utterance_ids = [0] * self._max_seq_length - # Denotes the identity of the sequence. Takes values 0 (schema description) and 1 (system and user utterance). - self.utterance_segment = [0] * self._max_seq_length - # Mask which takes the value 0 for padded tokens and 1 otherwise. - self.utterance_mask = [0] * self._max_seq_length - # Start and inclusive end character indices in the original utterance - # corresponding to the tokens. 
This is used to obtain the character indices - # from the predicted subword indices during inference. - # NOTE: A positive value indicates the character indices in the schema description - # whereas a negative value indicates the character indices in the - # utterance. The indices are offset by 1 to prevent ambiguity in the - # 0 index, which could be in either the schema description or utterance by the - # above convention. Now the 0 index corresponds to padded tokens. - self.start_char_idx = [0] * self._max_seq_length - self.end_char_idx = [0] * self._max_seq_length - - # Id of categorical slot present in the example or 0 if not present. - self.categorical_slot_id = 0 - # Id of non categorical slot present in the example or 0 if not present. - self.noncategorical_slot_id = 0 - # The status of categorical slot in the example. - self.categorical_slot_status = STATUS_OFF - # The status of non categorical slot in the example. - self.noncategorical_slot_status = STATUS_OFF - # Masks out tasks not represented by example - self.task_mask = [0] * schema_config["NUM_TASKS"] - - # The index of the starting subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_start = 0 - # The index of the ending (inclusive) subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_end = 0 - - # Id of categorical slot value present in the example or 0 if not present. - self.categorical_slot_value_id = 0 - # The status of categorical slot value in the example. - self.categorical_slot_value_status = STATUS_OFF - # Id of requested slot present in the example or 0 if not present. - self.requested_slot_id = 0 - # Takes value 1 if the corresponding slot is requested, 0 otherwise. - self.requested_slot_status = STATUS_OFF - - # ID of intent present in the example. - self.intent_id = 0 - # Takes value 1 if the intent is active, 0 otherwise. - self.intent_status = STATUS_OFF - - @property - def readable_summary(self): - """Get a readable dict that summarizes the attributes of an InputExample.""" - seq_length = sum(self.utterance_mask) - utt_toks = self._tokenizer.ids_to_tokens(self.utterance_ids[:seq_length]) - utt_tok_mask_pairs = list(zip(utt_toks, self.utterance_segment[:seq_length])) - active_intent = ( - self.service_schema.get_intent_from_id(self.intent_id) if self.intent_status == STATUS_ACTIVE else "" - ) - slot_values_in_state = {} - if self.categorical_slot_status == STATUS_ACTIVE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = self.service_schema.get_categorical_slot_value_from_id( - self.categorical_slot_id, self.categorical_slot_value_id - ) - elif self.categorical_slot_status == STATUS_DONTCARE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = STR_DONTCARE - if self.noncategorical_slot_status == STATUS_ACTIVE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - start_id = self.noncategorical_slot_value_start[slot] - end_id = self.noncategorical_slot_value_end[slot] - # Token list is consisted of the subwords that may start with "##". We - # remove "##" to reconstruct the original value. Note that it's not a - # strict restoration of the original string. It's primarily used for - # debugging. - # ex. 
["san", "j", "##ose"] --> "san jose" - readable_value = " ".join(utt_toks[start_id : end_id + 1]).replace(" ##", "") - slot_values_in_state[slot] = readable_value - elif self.noncategorical_slot_status == STATUS_DONTCARE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - slot_values_in_state[slot] = STR_DONTCARE - - summary_dict = { - "utt_tok_mask_pairs": utt_tok_mask_pairs, - "utt_len": seq_length, - "categorical_slot_id": self.categorical_slot_id, - "noncategorical_slot_id": self.noncategorical_slot_id, - "intent_id": self.intent_id, - "service_name": self.service_schema.service_name, - "active_intent": active_intent, - "slot_values_in_state": slot_values_in_state, - } - return summary_dict - - def add_utterance_features( - self, system_tokens, system_inv_alignments, user_tokens, user_inv_alignments, system_utterance, user_utterance - ): - """Add utterance related features input to InputExample. - - Note: this method modifies the system tokens and user_tokens in place to - make their total length <= the maximum input length for BERT model. - - Args: - system_tokens: a list of strings which represents schema description. - system_inv_alignments: a list of tuples which denotes the start and end - charater of the tpken that a bert token originates from in the original - schema description. - user_tokens: a list of strings which represents utterance. - user_inv_alignments: a list of tuples which denotes the start and end - charater of the token that a bert token originates from in the original - system and user utterance. - """ - # Input sequence length for utterance BERT encoder - max_utt_len = self._max_seq_length - - # Modify lengths of schema description & utterance so that length of total utt - # (including cls_token, setp_token, sep_token) is no more than max_utt_len - is_too_long = truncate_seq_pair(system_tokens, user_tokens, max_utt_len - 3) - if is_too_long: - logging.debug( - f'Utterance sequence truncated in example id - {self.example_id} from {len(system_tokens) + len(user_tokens)}.' - ) - - # Construct the tokens, segment mask and valid token mask which will be - # input to BERT, using the tokens for schema description (sequence A) and - # system and user utterance (sequence B). - utt_subword = [] - utt_seg = [] - utt_mask = [] - start_char_idx = [] - end_char_idx = [] - - utt_subword.append(self._tokenizer.cls_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(system_tokens): - utt_subword.append(subword) - utt_seg.append(0) - utt_mask.append(1) - st, en = system_inv_alignments[subword_idx] - start_char_idx.append(-(st + 1)) - end_char_idx.append(-(en + 1)) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(user_tokens): - utt_subword.append(subword) - utt_seg.append(1) - utt_mask.append(1) - st, en = user_inv_alignments[subword_idx] - start_char_idx.append(st + 1) - end_char_idx.append(en + 1) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(1) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - utterance_ids = self._tokenizer.tokens_to_ids(utt_subword) - - # Zero-pad up to the BERT input sequence length. 
- while len(utterance_ids) < max_utt_len: - utterance_ids.append(0) - utt_seg.append(0) - utt_mask.append(0) - start_char_idx.append(0) - end_char_idx.append(0) - self.utterance_ids = utterance_ids - self.utterance_segment = utt_seg - self.utterance_mask = utt_mask - self.start_char_idx = start_char_idx - self.end_char_idx = end_char_idx - - self.user_utterance = user_utterance - self.system_utterance = system_utterance - - def make_copy(self): - """Make a copy of the current example with utterance features.""" - new_example = SGDInputExample( - schema_config=self.schema_config, - service_schema=self.service_schema, - example_id=self.example_id, - example_id_num=self.example_id_num.copy(), - tokenizer=self._tokenizer, - ) - return new_example - - def make_copy_of_categorical_features(self): - """Make a copy of the current example with utterance and categorical features.""" - new_example = self.make_copy() - - new_example.categorical_slot_status = self.categorical_slot_status - return new_example - - def make_copy_of_non_categorical_features(self): - """Make a copy of the current example with utterance features and non categorical features.""" - new_example = self.make_copy() - new_example.noncategorical_slot_id = self.noncategorical_slot_id - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.utterance_ids = list(self.utterance_ids) - new_example.utterance_segment = list(self.utterance_segment) - new_example.utterance_mask = list(self.utterance_mask) - new_example.start_char_idx = list(self.start_char_idx) - new_example.end_char_idx = list(self.end_char_idx) - new_example.user_utterance = self.user_utterance - new_example.system_utterance = self.system_utterance - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.noncategorical_slot_value_start = self.noncategorical_slot_value_start - new_example.noncategorical_slot_value_end = self.noncategorical_slot_value_end - return new_example - - def add_categorical_slots(self, state_update: dict): - """Add features for categorical slots. - Args: - state_update: slot value pairs of the state update - """ - - categorical_slots = self.service_schema.categorical_slots - if not categorical_slots: - return - slot = categorical_slots[self.categorical_slot_id] - values = state_update.get(slot, []) - - if not values: - self.categorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.categorical_slot_status = STATUS_DONTCARE - else: - self.categorical_slot_status = STATUS_ACTIVE - self.categorical_slot_value_status = ( - self.categorical_slot_value_id == self.service_schema.get_categorical_slot_value_id(slot, values[0]) - ) - - def add_noncategorical_slots(self, state_update: dict, system_span_boundaries: dict, user_span_boundaries: dict): - """Add features for non-categorical slots. - Args: - state_update: slot value pairs of state update - system_span_boundaries: span boundaries of schema description - user_span_boundaries: span boundaries of utterance - """ - - noncategorical_slots = self.service_schema.non_categorical_slots - slot = noncategorical_slots[self.noncategorical_slot_id] - - values = state_update.get(slot, []) - if not values: - self.noncategorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.noncategorical_slot_status = STATUS_DONTCARE - else: - self.noncategorical_slot_status = STATUS_ACTIVE - # Add indices of the start and end tokens for the first encountered - # value. 
Spans in user utterance are prioritized over the system - # utterance. If a span is not found, the slot value is ignored. - if slot in user_span_boundaries: - start, end = user_span_boundaries[slot] - elif slot in system_span_boundaries: - start, end = system_span_boundaries[slot] - else: - # A span may not be found because the value was cropped out or because - # the value was mentioned earlier in the dialogue. Since this model - # only makes use of the last two utterances to predict state updates, - # it will fail in such cases. - logging.debug( - f'"Slot values {str(values)} not found in user or system utterance in example with id - {self.example_id}.' - ) - start = 0 - end = 0 - self.noncategorical_slot_value_start = start - self.noncategorical_slot_value_end = end - - def add_requested_slots(self, frame: dict): - """Add requested slots to InputExample - Args: - frame: frame object from which requested slots are extracted - """ - all_slots = self.service_schema.slots - slot = all_slots[self.requested_slot_id] - if slot in frame["labels"]["slots"]: - self.requested_slot_status = STATUS_ACTIVE - - def add_intents(self, frame): - """Add intents to InputExample - Args: - frame: frame object from which intents are extracted - """ - all_intents = self.service_schema.intents - intent = all_intents[self.intent_id] - if intent == frame["labels"]["intent"]: - self.intent_status = STATUS_ACTIVE - - -# Modified from run_classifier._truncate_seq_pair in the public bert model repo. -# https://github.com/google-research/bert/blob/master/run_classifier.py. -def truncate_seq_pair(tokens_a: List[int], tokens_b: List[int], max_length: int) -> bool: - """Truncate a seq pair in place so that their total length <= max_length. - Args: - tokens_a: first token sequence - tokens_b: second token sequence - max_length: truncated sequence length - Returns: - is_too_long: whether combined sequences exceed maximum sequence length - """ - is_too_long = False - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - is_too_long = True - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - return is_too_long diff --git a/nemo/collections/nlp/data/dialogue/sgd/__init__.py b/nemo/collections/nlp/data/dialogue/sgd/__init__.py deleted file mode 100644 index 9bc88d075659..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
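The start_char_idx/end_char_idx convention above (indices offset by 1, positive for the system/user utterance, negative for the schema description, 0 reserved for padding and special tokens) implies a small decoding step when mapping predicted subword spans back to characters. A sketch under those assumptions; the index arrays in the usage comment are invented.

def decode_char_span(start_char_idx, end_char_idx, tok_start, tok_end):
    """Map a predicted subword span back to inclusive character offsets.

    Positive entries point into the utterance, negative entries into the
    schema description, and 0 marks padding/special tokens.
    """
    st, en = start_char_idx[tok_start], end_char_idx[tok_end]
    if st > 0 and en > 0:
        return "utterance", st - 1, en - 1
    if st < 0 and en < 0:
        return "schema", -st - 1, -en - 1
    return None  # padding or special token: no character span

# e.g. with start_char_idx = [0, -1, -6, 0, 1, 6] and end_char_idx = [0, -4, -9, 0, 4, 9],
# decode_char_span(start_char_idx, end_char_idx, 4, 5) -> ("utterance", 0, 8)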
- -from nemo.collections.nlp.data.dialogue.sgd.evaluate import evaluate, get_in_domain_services -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/nemo/collections/nlp/data/dialogue/sgd/evaluate.py b/nemo/collections/nlp/data/dialogue/sgd/evaluate.py deleted file mode 100644 index 0829543dcc51..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/evaluate.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Evaluate predictions JSON file, w.r.t. ground truth file. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/evaluate.py -""" - -import collections -import glob -import json -import os - -import numpy as np - -from nemo.collections.nlp.metrics.sgd_metrics import ( - ACTIVE_INTENT_ACCURACY, - JOINT_CAT_ACCURACY, - JOINT_GOAL_ACCURACY, - JOINT_NONCAT_ACCURACY, - NAN_VAL, - REQUESTED_SLOTS_F1, - REQUESTED_SLOTS_PRECISION, - REQUESTED_SLOTS_RECALL, - SLOT_TAGGING_F1, - SLOT_TAGGING_PRECISION, - SLOT_TAGGING_RECALL, - get_active_intent_accuracy, - get_average_and_joint_goal_accuracy, - get_requested_slots_f1, - get_slot_tagging_f1, -) -from nemo.utils import logging - -__all__ = ['get_in_domain_services'] - -ALL_SERVICES = "#ALL_SERVICES" -SEEN_SERVICES = "#SEEN_SERVICES" -UNSEEN_SERVICES = "#UNSEEN_SERVICES" - -# Name of the file containing all predictions and their corresponding frame metrics. -PER_FRAME_OUTPUT_FILENAME = "dialogues_and_metrics.json" - - -def get_service_set(schema_path: str) -> set: - """ - Get the set of all services present in a schema. - Args: - schema_path: schema file path - Returns: - service_set: set of services in file - """ - service_set = set() - with open(schema_path, encoding="UTF-8") as f: - schema = json.load(f) - for service in schema: - service_set.add(service["service_name"]) - f.close() - return service_set - - -def get_in_domain_services(schema_path: str, service_set: set) -> set: - """Get the set of common services between a schema and set of services. - Args: - schema_path: path to schema file - service_set: set of services - Returns: - joint_services: joint services between schema path file and service set - """ - joint_services = get_service_set(schema_path) & service_set - return joint_services - - -def get_dataset_as_dict(file_path_patterns) -> dict: - """Read the DSTC8/SGD json dialogue data as dictionary with dialog ID as keys. 
- Args: - file_path_patterns: list or directory of files - Returns: - dataset_dict: dataset dictionary with dialog ID as keys - """ - dataset_dict = {} - if isinstance(file_path_patterns, list): - list_fp = file_path_patterns - else: - list_fp = sorted(glob.glob(file_path_patterns)) - for fp in list_fp: - if PER_FRAME_OUTPUT_FILENAME in fp: - continue - logging.debug("Loading file: %s", fp) - with open(fp, encoding="UTF-8") as f: - data = json.load(f) - if isinstance(data, list): - for dial in data: - dataset_dict[dial["dialogue_id"]] = dial - elif isinstance(data, dict): - dataset_dict.update(data) - f.close() - return dataset_dict - - -def get_metrics( - dataset_ref: dict, - dataset_hyp: dict, - service_schemas: dict, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -): - """Calculate the DSTC8/SGD metrics. - Args: - dataset_ref: The ground truth dataset represented as a dict mapping dialogue id to the corresponding dialogue. - dataset_hyp: The predictions in the same format as `dataset_ref`. - service_schemas: A dict mapping service name to the schema for the service. - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - all_metric_aggregate: A dict mapping a metric collection name to a dict containing the values - for various metrics. Each metric collection aggregates the metrics across a specific set of frames in the dialogues. - per_frame_metric: metrics aggregated for each frame - """ - # Metrics can be aggregated in various ways, eg over all dialogues, only for - # dialogues containing unseen services or for dialogues corresponding to a - # single service. This aggregation is done through metric_collections, which - # is a dict mapping a collection name to a dict, which maps a metric to a list - # of values for that metric. Each value in this list is the value taken by - # the metric on a frame. - metric_collections = collections.defaultdict(lambda: collections.defaultdict(list)) - - # Ensure the dialogs in dataset_hyp also occur in dataset_ref. - assert set(dataset_hyp.keys()).issubset(set(dataset_ref.keys())) - logging.debug("len(dataset_hyp)=%d, len(dataset_ref)=%d", len(dataset_hyp), len(dataset_ref)) - - # Store metrics for every frame for debugging. - per_frame_metric = {} - - for dial_id, dial_hyp in dataset_hyp.items(): - dial_ref = dataset_ref[dial_id] - - if set(dial_ref["services"]) != set(dial_hyp["services"]): - raise ValueError( - "Set of services present in ground truth and predictions don't match " - "for dialogue with id {}".format(dial_id) - ) - - joint_metrics = [JOINT_GOAL_ACCURACY, JOINT_CAT_ACCURACY, JOINT_NONCAT_ACCURACY] - for turn_id, (turn_ref, turn_hyp) in enumerate(zip(dial_ref["turns"], dial_hyp["turns"])): - metric_collections_per_turn = collections.defaultdict(lambda: collections.defaultdict(lambda: 1.0)) - if turn_ref["speaker"] != turn_hyp["speaker"]: - raise ValueError("Speakers don't match in dialogue with id {}".format(dial_id)) - - # Skip system turns because metrics are only computed for user turns. 
- if turn_ref["speaker"] != "USER": - continue - - if turn_ref["utterance"] != turn_hyp["utterance"]: - logging.error("Ref utt: %s", turn_ref["utterance"]) - logging.error("Hyp utt: %s", turn_hyp["utterance"]) - raise ValueError("Utterances don't match for dialogue with id {}".format(dial_id)) - - hyp_frames_by_service = {frame["service"]: frame for frame in turn_hyp["frames"]} - - # Calculate metrics for each frame in each user turn. - for frame_ref in turn_ref["frames"]: - service_name = frame_ref["service"] - if service_name not in hyp_frames_by_service: - raise ValueError( - "Frame for service {} not found in dialogue with id {}".format(service_name, dial_id) - ) - service = service_schemas[service_name] - frame_hyp = hyp_frames_by_service[service_name] - - active_intent_acc = get_active_intent_accuracy(frame_ref, frame_hyp) - slot_tagging_f1_scores = get_slot_tagging_f1(frame_ref, frame_hyp, turn_ref["utterance"], service) - requested_slots_f1_scores = get_requested_slots_f1(frame_ref, frame_hyp) - goal_accuracy_dict = get_average_and_joint_goal_accuracy( - frame_ref, frame_hyp, service, use_fuzzy_match - ) - - frame_metric = { - ACTIVE_INTENT_ACCURACY: active_intent_acc, - REQUESTED_SLOTS_F1: requested_slots_f1_scores.f1, - REQUESTED_SLOTS_PRECISION: requested_slots_f1_scores.precision, - REQUESTED_SLOTS_RECALL: requested_slots_f1_scores.recall, - } - if slot_tagging_f1_scores is not None: - frame_metric[SLOT_TAGGING_F1] = slot_tagging_f1_scores.f1 - frame_metric[SLOT_TAGGING_PRECISION] = slot_tagging_f1_scores.precision - frame_metric[SLOT_TAGGING_RECALL] = slot_tagging_f1_scores.recall - frame_metric.update(goal_accuracy_dict) - - frame_id = "{:s}-{:03d}-{:s}".format(dial_id, turn_id, frame_hyp["service"]) - per_frame_metric[frame_id] = frame_metric - # Add the frame-level metric result back to dialogues. - frame_hyp["metrics"] = frame_metric - - # Get the domain name of the service. - domain_name = frame_hyp["service"].split("_")[0] - domain_keys = [ALL_SERVICES, frame_hyp["service"], domain_name] - if frame_hyp["service"] in in_domain_services: - domain_keys.append(SEEN_SERVICES) - - else: - domain_keys.append(UNSEEN_SERVICES) - for domain_key in domain_keys: - for metric_key, metric_value in frame_metric.items(): - if metric_value != NAN_VAL: - if joint_acc_across_turn and metric_key in joint_metrics: - metric_collections_per_turn[domain_key][metric_key] *= metric_value - else: - metric_collections[domain_key][metric_key].append(metric_value) - if joint_acc_across_turn: - # Conduct multiwoz style evaluation that computes joint goal accuracy - # across all the slot values of all the domains for each turn. - for domain_key in metric_collections_per_turn: - for metric_key, metric_value in metric_collections_per_turn[domain_key].items(): - metric_collections[domain_key][metric_key].append(metric_value) - - all_metric_aggregate = {} - for domain_key, domain_metric_vals in metric_collections.items(): - domain_metric_aggregate = {} - for metric_key, value_list in domain_metric_vals.items(): - if value_list: - # Metrics are macro-averaged across all frames. 
- domain_metric_aggregate[metric_key] = round(float(np.mean(value_list)) * 100.0, 2) - else: - domain_metric_aggregate[metric_key] = NAN_VAL - all_metric_aggregate[domain_key] = domain_metric_aggregate - return all_metric_aggregate, per_frame_metric - - -def evaluate( - prediction_dir: str, - data_dir: str, - eval_dataset: str, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -) -> dict: - """Calculate the DSTC8/SGD metrics for given data. - - Args: - prediction_dir: prediction location - data_dir: ground truth data location. - eval_dataset: evaluation data split - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint goal accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - A dict mapping a metric collection name to a dict containing the values - for various metrics for all dialogues and all services - """ - - with open(os.path.join(data_dir, eval_dataset, "schema.json"), encoding="UTF-8") as f: - eval_services = {} - list_services = json.load(f) - for service in list_services: - eval_services[service["service_name"]] = service - f.close() - - dataset_ref = get_dataset_as_dict(os.path.join(data_dir, eval_dataset, "dialogues_*.json")) - dataset_hyp = get_dataset_as_dict(os.path.join(prediction_dir, "*.json")) - - # has ALLSERVICE, SEEN_SERVICES, UNSEEN_SERVICES, SERVICE, DOMAIN - all_metric_aggregate, _ = get_metrics( - dataset_ref, dataset_hyp, eval_services, in_domain_services, joint_acc_across_turn, use_fuzzy_match - ) - if SEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {SEEN_SERVICES} : {sorted(all_metric_aggregate[SEEN_SERVICES].items())}') - if UNSEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {UNSEEN_SERVICES}: {sorted(all_metric_aggregate[UNSEEN_SERVICES].items())}') - if ALL_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {ALL_SERVICES} : {sorted(all_metric_aggregate[ALL_SERVICES].items())}') - - # Write the per-frame metrics values with the corrresponding dialogue frames. - with open(os.path.join(prediction_dir, PER_FRAME_OUTPUT_FILENAME), "w", encoding="UTF-8") as f: - json.dump(dataset_hyp, f, indent=2, separators=(",", ": ")) - f.close() - return all_metric_aggregate[ALL_SERVICES] diff --git a/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py b/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py deleted file mode 100644 index c9ddd2fd6f23..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
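The aggregation in get_metrics boils down to appending each frame's value to every matching collection key and then macro-averaging per key. A toy, standalone sketch of that pattern; the service names, metric name and values below are invented.

import collections

import numpy as np

metric_collections = collections.defaultdict(lambda: collections.defaultdict(list))

# Pretend two frames were scored; '#ALL_SERVICES' aggregates everything,
# per-service keys aggregate only their own frames.
for domain_key, value in [("#ALL_SERVICES", 1.0), ("Restaurants_1", 1.0),
                          ("#ALL_SERVICES", 0.0), ("Hotels_2", 0.0)]:
    metric_collections[domain_key]["active_intent_accuracy"].append(value)

aggregate = {
    domain: {name: round(float(np.mean(vals)) * 100.0, 2) for name, vals in metrics.items()}
    for domain, metrics in metric_collections.items()
}
print(aggregate["#ALL_SERVICES"])  # -> {'active_intent_accuracy': 50.0}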
- -""" -Prediction and evaluation-related utility functions. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/pred_utils.py -""" - -import json -import os -from collections import OrderedDict, defaultdict -from typing import Dict, List, Optional - -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import ( - STATUS_ACTIVE, - STATUS_DONTCARE, - STR_DONTCARE, -) -from nemo.utils import logging - -REQ_SLOT_THRESHOLD = 0.5 - - -__all__ = ['write_predictions_to_file'] - - -def set_cat_slot(predictions_status: dict, predictions_value: dict, cat_slot_values: Dict[str, List[str]]) -> dict: - """ - Extract predicted categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - cat_slot_values: possible categorical slots and their potential values for this service - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(cat_slot_values): - slot_status = predictions_status[slot_idx][0]["cat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tmp = predictions_value[slot_idx] - value_idx = max(tmp, key=lambda k: tmp[k]['cat_slot_value_status'][0].item()) - out_dict[slot] = cat_slot_values[slot][value_idx] - return out_dict - - -def set_noncat_slot( - predictions_status: dict, - predictions_value: dict, - non_cat_slots: List[str], - user_utterance: str, - sys_slots_agg: Optional[dict] = None, -) -> dict: - """ - Extract predicted non categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - non_cat_slots: list of possible non categorical slots for this service - user_utterance: system and user utterance - sys_slots_agg: system retrieval lookup table. Contains for each slot the most recent value seen in the history - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(non_cat_slots): - slot_status = predictions_status[slot_idx][0]["noncat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tok_start_idx = predictions_value[slot_idx][0]["noncat_slot_start"] - tok_end_idx = predictions_value[slot_idx][0]["noncat_slot_end"] - ch_start_idx = predictions_value[slot_idx][0]["noncat_alignment_start"][tok_start_idx] - ch_end_idx = predictions_value[slot_idx][0]["noncat_alignment_end"][tok_end_idx] - if ch_start_idx > 0 and ch_end_idx > 0: - # Add span from the utterance. - out_dict[slot] = user_utterance[ch_start_idx - 1 : ch_end_idx] - elif sys_slots_agg and slot in sys_slots_agg: - # system retrieval - out_dict[slot] = sys_slots_agg[slot] - return out_dict - - -def get_predicted_dialog(dialog: dict, all_predictions: dict, schemas: object, state_tracker: str) -> dict: - """Overwrite the labels in the turn with the predictions from the model. For test set, these labels are missing from the data and hence they are added. - Args: - dialog: ground truth dialog - all_predictions: predictions - schemas: schema object of all services of all datasets - state_tracker: state tracker option, e.g. 
nemotracker - Returns: - dialog: dialog overwritten with prediction information - """ - dialog_id = dialog["dialogue_id"] - if state_tracker == "baseline": - sys_slots_agg = {} - else: - sys_slots_agg = defaultdict(OrderedDict) - all_slot_values = defaultdict(dict) - for turn_idx, turn in enumerate(dialog["turns"]): - if turn["speaker"] == "SYSTEM" and state_tracker == 'nemotracker': - for frame in turn["frames"]: - if frame["service"] not in sys_slots_agg: - sys_slots_agg[frame["service"]] = OrderedDict() - for action in frame["actions"]: - if action["slot"] and len(action["values"]) > 0: - sys_slots_agg[frame["service"]][action["slot"]] = action["values"][0] - if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - system_utterance = dialog["turns"][turn_idx - 1]["utterance"] if turn_idx else "" - system_user_utterance = system_utterance + ' ' + user_utterance - turn_id = "{:02d}".format(turn_idx) - for frame in turn["frames"]: - - predictions = all_predictions[(dialog_id, turn_id, frame["service"])] - slot_values = all_slot_values[frame["service"]] - service_schema = schemas.get_service_schema(frame["service"]) - # Remove the slot spans and state if present. - frame.pop("slots", None) - frame.pop("state", None) - - # The baseline model doesn't predict slot spans. Only state predictions - # are added. - state = {} - - # Add prediction for active intent. No Offset is subtracted since schema has now NONE intent at index 0 - state["active_intent"] = get_predicted_intent( - predictions=predictions[0], intents=service_schema.intents - ) - # Add prediction for requested slots. - state["requested_slots"] = get_requested_slot(predictions=predictions[1], slots=service_schema.slots) - - # Add prediction for user goal (slot values). - # Categorical slots. - cat_out_dict = set_cat_slot( - predictions_status=predictions[2], - predictions_value=predictions[3], - cat_slot_values=service_schema.categorical_slot_values, - ) - for k, v in cat_out_dict.items(): - slot_values[k] = v - - # Non-categorical slots. - noncat_out_dict = set_noncat_slot( - predictions_status=predictions[4], - predictions_value=predictions[5], - non_cat_slots=service_schema.non_categorical_slots, - user_utterance=system_user_utterance, - sys_slots_agg=sys_slots_agg.get(frame["service"], None), - ) - for k, v in noncat_out_dict.items(): - slot_values[k] = v - # Create a new dict to avoid overwriting the state in previous turns - # because of use of same objects. 
- state["slot_values"] = {s: [v] for s, v in slot_values.items()} - frame["state"] = state - return dialog - - -def get_predicted_intent(predictions: dict, intents: List[str]) -> str: - """ - Returns intent name with maximum score - Args: - predictions: predictions - intents: list of possible intents for this service - Returns: - intent: predicted intent - """ - assert len(predictions) == len(intents) - active_intent_id = max(predictions, key=lambda k: predictions[k][0]['intent_status']) - intent = intents[active_intent_id] - return intent - - -def get_requested_slot(predictions: dict, slots: List[str]) -> List[str]: - """ - Returns list of slots which are predicted to be requested - Args: - predictions: predictions - slots: list of possible slots - Returns: - requested_slots: list of requested slots - """ - active_indices = [k for k in predictions if predictions[k][0]["req_slot_status"] > REQ_SLOT_THRESHOLD] - requested_slots = list(map(lambda k: slots[k], active_indices)) - return requested_slots - - -def write_predictions_to_file( - predictions: List[dict], - input_json_files: List[str], - output_dir: str, - schemas: object, - state_tracker: str, - eval_debug: bool, - in_domain_services: set, -): - """Save predicted dialogues as json files. - - Args: - predictions: An iterator containing model predictions. This is the output of - the predict method in the estimator. - input_json_files: A list of json paths containing the dialogues to run - inference on. - output_dir: The directory where output json files will be created. - schemas: Schemas to all services in the dst dataset - state_tracker: state tracker option - eval_debug: output evaluation debugging information - in_domain_services: in domain services - """ - logging.info(f"Writing predictions to {output_dir} started.") - - # Index all predictions. - all_predictions = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - for idx, prediction in enumerate(predictions): - eval_dataset, dialog_id, turn_id, service_name, model_task, slot_intent_id, value_id = prediction[ - 'example_id' - ].split('-') - all_predictions[(dialog_id, turn_id, service_name)][int(model_task)][int(slot_intent_id)][ - int(value_id) - ] = prediction - logging.info(f'Predictions for {idx} examples in {eval_dataset} dataset are getting processed.') - - # Read each input file and write its predictions. - for input_file_path in input_json_files: - with open(input_file_path, encoding="UTF-8") as f: - dialogs = json.load(f) - logging.debug(f'{input_file_path} file is loaded') - pred_dialogs = [] - for d in dialogs: - pred_dialog = get_predicted_dialog(d, all_predictions, schemas, state_tracker) - pred_dialogs.append(pred_dialog) - input_file_name = os.path.basename(input_file_path) - output_file_path = os.path.join(output_dir, input_file_name) - with open(output_file_path, "w", encoding="UTF-8") as f: - json.dump(pred_dialogs, f, indent=2, separators=(",", ": "), sort_keys=True) diff --git a/nemo/collections/nlp/data/dialogue/sgd/schema.py b/nemo/collections/nlp/data/dialogue/sgd/schema.py deleted file mode 100644 index b12a11fdb63c..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/schema.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Wrappers for schemas of different services. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/schema.py -""" - -import json -from typing import List, Optional, Union - -from nemo.utils import logging - -__all__ = ['Schema'] - - -class ServiceSchema(object): - """A wrapper for schema for a service.""" - - def __init__(self, schema_json: dict, service_id: Optional[int] = None): - """ - Constructor for ServiceSchema. - Args: - schema_json: schema json dict - service_id: service ID - """ - self._service_name = schema_json["service_name"] - self._description = schema_json["description"] - self._schema_json = schema_json - self._service_id = service_id - - # Construct the vocabulary for intents, slots, categorical slots, - # non-categorical slots and categorical slot values. - self._intents = ["NONE"] + sorted(i["name"] for i in schema_json["intents"]) - self._intent_descriptions = {i["name"]: i["description"] for i in schema_json["intents"]} - self._intent_descriptions["NONE"] = "none" - self._slots = sorted(s["name"] for s in schema_json["slots"]) - self._slots_descriptions = {s["name"]: s["description"] for s in schema_json["slots"]} - self._categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if s["is_categorical"] and s["name"] in self.state_slots - ) - self._non_categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if not s["is_categorical"] and s["name"] in self.state_slots - ) - slot_schemas = {s["name"]: s for s in schema_json["slots"]} - categorical_slot_values = {} - categorical_slot_value_ids = {} - categorical_slot_ids = {} - non_categorical_slot_ids = {} - for slot_id, slot in enumerate(self._categorical_slots): - slot_schema = slot_schemas[slot] - values = sorted(slot_schema["possible_values"]) - categorical_slot_values[slot] = values - value_ids = {value: idx for idx, value in enumerate(values)} - categorical_slot_value_ids[slot] = value_ids - categorical_slot_ids[slot] = slot_id - - for slot_id, slot in enumerate(self._non_categorical_slots): - non_categorical_slot_ids[slot] = slot_id - - self._categorical_slot_values = categorical_slot_values - self._categorical_slot_value_ids = categorical_slot_value_ids - - self._categorical_slot_ids = categorical_slot_ids - self._non_categorical_slot_ids = non_categorical_slot_ids - - @property - def schema_json(self) -> dict: - """Returns schema json dictionary""" - return self._schema_json - - @property - def state_slots(self) -> set: - """Set of slots which are permitted to be in the dialogue state.""" - state_slots = set() - for intent in self._schema_json["intents"]: - state_slots.update(intent["required_slots"]) - state_slots.update(intent["optional_slots"]) - return state_slots - - @property - def service_name(self): - return self._service_name - - @property - def service_id(self): - return self._service_id - - @property - def description(self): - return self._description - - @property - def slots(self): - return self._slots - - @property - def intents(self): - 
return self._intents - - @property - def intent_descriptions(self): - return self._intent_descriptions - - @property - def slot_descriptions(self): - return self._slots_descriptions - - @property - def categorical_slots(self): - return self._categorical_slots - - @property - def non_categorical_slots(self): - return self._non_categorical_slots - - @property - def categorical_slot_values(self): - return self._categorical_slot_values - - def get_categorical_slot_values(self, slot): - return self._categorical_slot_values[slot] - - def get_slot_from_id(self, slot_id): - return self._slots[slot_id] - - def get_intent_from_id(self, intent_id): - return self._intents[intent_id] - - def get_categorical_slot_from_id(self, slot_id): - return self._categorical_slots[slot_id] - - def get_non_categorical_slot_from_id(self, slot_id): - return self._non_categorical_slots[slot_id] - - def get_categorical_slot_value_from_id(self, slot_id, value_id): - slot = self._categorical_slots[slot_id] - return self._categorical_slot_values[slot][value_id] - - def get_categorical_slot_value_id(self, slot, value): - return self._categorical_slot_value_ids[slot][value] - - def get_categorical_slot_id(self, slot): - return self._categorical_slot_ids[slot] - - def get_non_categorical_slot_id(self, slot): - return self._non_categorical_slot_ids[slot] - - -class Schema(object): - """Wrapper for schemas for all services in a dataset.""" - - def __init__(self, schema_json_paths: Union[str, List[str]]): - """ - schema_json_paths: list of .json paths to schema files, or a single str with the path to the json file. - """ - # Load the schema from the json file. - if isinstance(schema_json_paths, str): - with open(schema_json_paths, "r") as f: - all_schemas = json.load(f) - f.close() - else: - # load multiple schemas from the list of the json files - all_schemas = [] - completed_services = [] - for schema_json_path in schema_json_paths: - with open(schema_json_path, "r") as f: - schemas = json.load(f) - f.close() - logging.debug("Num of services in %s: %s", schema_json_path, len(schemas)) - - for service in schemas: - if service['service_name'] not in completed_services: - completed_services.append(service['service_name']) - all_schemas.append(service) - - self._services = sorted(schema["service_name"] for schema in all_schemas) - self._services_vocab = {v: k for k, v in enumerate(self._services)} - self._services_id_to_vocab = {v: k for k, v in self._services_vocab.items()} - service_schemas = {} - for schema in all_schemas: - service = schema["service_name"] - service_schemas[service] = ServiceSchema(schema, service_id=self.get_service_id(service)) - - self._service_schemas = service_schemas - self._schemas = all_schemas - self._slots_relation_list = {} - - def get_service_id(self, service: str): - return self._services_vocab[service] - - def get_service_from_id(self, service_id: int): - return self._services[service_id] - - def get_service_schema(self, service: str): - return self._service_schemas[service] - - @property - def services(self): - return self._services - - def save_to_file(self, file_path): - """ - Saves schema object to file - Args: - file_path: path to store schema object at - """ - with open(file_path, "w") as f: - json.dump(self._schemas, f, indent=2) diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py index 18414412d91c..fda3c1f799b9 100644 --- a/nemo/collections/nlp/metrics/__init__.py +++ b/nemo/collections/nlp/metrics/__init__.py @@ -12,7 +12,7 @@ # See the License for
the specific language governing permissions and # limitations under the License. -from nemo.collections.nlp.metrics.classification_report import ClassificationReport, MultiLabelClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics -from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity +from nemo.collections.nlp.metrics.classification_report import ClassificationReport # noqa: F401 +from nemo.collections.nlp.metrics.classification_report import MultiLabelClassificationReport # noqa: F401 +from nemo.collections.nlp.metrics.qa_metrics import QAMetrics # noqa: F401 +from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity # noqa: F401 diff --git a/nemo/collections/nlp/metrics/dialogue_metrics.py b/nemo/collections/nlp/metrics/dialogue_metrics.py deleted file mode 100644 index 7330a1c90611..000000000000 --- a/nemo/collections/nlp/metrics/dialogue_metrics.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from collections import Counter - -import numpy as np -from sacrebleu import corpus_bleu - - -class DialogueGenerationMetrics(object): - @staticmethod - def save_predictions( - filename, generated_field, ground_truth_field, inputs, - ): - """ - Save predictions as a jsonl file - - Args: - Each arg is a list of strings (all args have the same length) - """ - docs = [] - for i in range(len(inputs)): - docs.append( - {"input": inputs[i], "ground_truth": ground_truth_field[i], "generated": generated_field[i],} - ) - with open(filename, 'w', encoding="UTF-8") as f: - for item in docs: - f.write(json.dumps(item) + "\n") - - @staticmethod - def _get_one_f1(generated_field, ground_truth_field): - """ - Get precision, recall, f1 based on token overlap between generated and ground_truth sequence - """ - generated_tokens = generated_field.split() - ground_truth_tokens = ground_truth_field.split() - - common = Counter(generated_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0, 0, 0 - precision = 1.0 * num_same / len(generated_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return np.array([precision * 100, recall * 100, f1 * 100]) - - @staticmethod - def get_f1(generated_fields, ground_truth_fields): - total_p_r_f1 = np.array( - [ - DialogueGenerationMetrics._get_one_f1(generated_fields[i], ground_truth_fields[i]) - for i in range(len(ground_truth_fields)) - ] - ) - avg_p_r_f1 = np.mean(total_p_r_f1, axis=0) - return avg_p_r_f1 - - @staticmethod - def get_bleu(generated_field, ground_truth_field): - """ - Referenced from NMT evaluation - Note 13a is the default tokenizer for English for WMT - Known issue that it doesn't hand edge case of None or '' - 
https://github.com/mjpost/sacrebleu/issues/161 - """ - valid_indices = [i for i in range(len(generated_field)) if generated_field[i] and ground_truth_field[i]] - generated_field = [generated_field[i] for i in valid_indices] - ground_truth_field = [ground_truth_field[i] for i in valid_indices] - sacre_bleu = corpus_bleu(generated_field, [ground_truth_field], tokenize="13a") - return sacre_bleu.score - - -class DialogueClassificationMetrics(object): - @staticmethod - def save_predictions( - filename, - generated_labels, - generated_slots, - ground_truth_labels, - ground_truth_slots, - generated_field, - ground_truth_field, - inputs, - ): - """ - Save predictions as a jsonl file - - Args: - Each arg is a list of strings (all args have the same length) - """ - docs = [] - for i in range(len(inputs)): - docs.append( - { - "input": inputs[i], - "ground_truth": ground_truth_field[i], - "ground_truth_slots": ground_truth_slots[i], - "ground_truth_labels": ground_truth_labels[i], - "generated": generated_field[i], - "generated_slots": generated_slots[i], - "generated_labels": generated_labels[i], - } - ) - with open(filename, 'w', encoding="UTF-8") as f: - for item in docs: - f.write(json.dumps(item) + "\n") - - @staticmethod - def split_label_and_slots(fields, with_slots=False): - """ - Split target into label and slots when doing joint label (i.e. intent) classificaiton and slot filling - - For instance, split "reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)" into - label = "reserve_restaurant" and slots = ["time_of_day(7pm)", "number_of_people(3)"] - Args: - fields: list of strings - """ - labels = [] - slots_list = [] - for field in fields: - if with_slots: - combo = [i.strip() for i in field.split('slots:', 1)] - label = 'none' - if len(combo) == 2: - label, slots = combo - elif len(combo) == 1: - slots = combo[0] - label = 'none' - if isinstance(slots, str): - # temporary patch for purnendu model output - if 'possible intents:' in slots: - slots = slots.split('possible intents:')[0] - slots = slots.split(', ') - else: - slots = ['None'] - else: - label = field - slots = [] - slots_list.append(slots) - labels.append(label) - - return labels, slots_list - - @staticmethod - def get_slot_filling_metrics(generated_slots, ground_truth_slots): - """ - Args: - generated_slots: list of list of strings. - Each string is slot-name and slot-value pair e.g. 
location(Seattle) - ground_truth_slots: list of list of strings - """ - all_recall = [] - all_precision = [] - all_joint_goal_accuracy = [] - - for i in range(len(generated_slots)): - # depulicate and sort - ground_truth = sorted(list(set(ground_truth_slots[i]))) - predicted = sorted(list(set(generated_slots[i]))) - correct = [item for item in predicted if item in ground_truth] - recall = len(correct) / len(ground_truth) if len(ground_truth) > 0 else 0 - precision = len(correct) / len(predicted) if len(predicted) > 0 else 0 - joint_goal_accuracy = int(ground_truth == predicted) - all_recall.append(recall) - all_precision.append(precision) - all_joint_goal_accuracy.append(joint_goal_accuracy) - - avg_joint_goal_accuracy = np.mean(all_joint_goal_accuracy) * 100 - avg_precision = np.mean(all_precision) * 100 - avg_recall = np.mean(all_recall) * 100 - avg_f1 = 2 * (avg_recall * avg_precision) / (avg_recall + avg_precision + 1e-20) - - return avg_precision, avg_recall, avg_f1, avg_joint_goal_accuracy diff --git a/nemo/collections/nlp/models/dialogue/__init__.py b/nemo/collections/nlp/models/dialogue/__init__.py deleted file mode 100644 index 2b75ee1a778a..000000000000 --- a/nemo/collections/nlp/models/dialogue/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.dialogue.dialogue_gpt_classification_model import DialogueGPTClassificationModel -from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel -from nemo.collections.nlp.models.dialogue.intent_slot_classification_model import IntentSlotClassificationModel -from nemo.collections.nlp.models.dialogue.sgdqa_model import SGDQAModel diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py deleted file mode 100644 index 6c7472b95c42..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ /dev/null @@ -1,805 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
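For readers who imported the metrics removed with dialogue_metrics.py above, the token-overlap F1 and slot-filling scores it computed reduce to a few lines of plain Python. The sketch below restates that logic outside of NeMo as an illustration only, not a drop-in replacement or part of the library after this change; the sample slot strings at the bottom are made-up values.

    # Standalone restatement of the deleted token-overlap F1 and slot-filling metrics.
    from collections import Counter
    import numpy as np

    def token_overlap_f1(generated: str, ground_truth: str):
        """Precision/recall/F1 from whitespace-token overlap between two strings."""
        gen_tokens, gt_tokens = generated.split(), ground_truth.split()
        num_same = sum((Counter(gen_tokens) & Counter(gt_tokens)).values())
        if num_same == 0:
            return 0.0, 0.0, 0.0
        precision = num_same / len(gen_tokens)
        recall = num_same / len(gt_tokens)
        return precision, recall, 2 * precision * recall / (precision + recall)

    def slot_filling_metrics(generated_slots, ground_truth_slots):
        """Average precision/recall/F1 and joint goal accuracy over deduplicated slot strings."""
        precisions, recalls, jgas = [], [], []
        for gen, gt in zip(generated_slots, ground_truth_slots):
            gen, gt = sorted(set(gen)), sorted(set(gt))
            correct = [s for s in gen if s in gt]
            recalls.append(len(correct) / len(gt) if gt else 0)
            precisions.append(len(correct) / len(gen) if gen else 0)
            jgas.append(int(gen == gt))
        avg_p, avg_r = 100 * np.mean(precisions), 100 * np.mean(recalls)
        f1 = 2 * avg_p * avg_r / (avg_p + avg_r + 1e-20)
        return avg_p, avg_r, f1, 100 * np.mean(jgas)

    # Example (made-up slots): one correct slot predicted plus one spurious slot
    # slot_filling_metrics([["time(7pm)", "people(3)"]], [["time(7pm)"]])
    # -> precision 50.0, recall 100.0, f1 ~66.7, joint goal accuracy 0.0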
- -import collections -import copy -import os -import random -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader -from transformers import AutoModelWithLMHead - -from nemo.collections.nlp.data.dialogue import DialogueGPTClassificationDataset, DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( - MegatronGPTPromptLearningModel, -) -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import VirtualPromptSource, VirtualPromptStyle -from nemo.collections.nlp.modules.common.text_generation_utils import ( - get_default_sampling_params, - megatron_gpt_generate, -) -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueGPTClassificationModel'] - - -class DialogueGPTClassificationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueGPTClassificationModel") - - self.cfg = cfg - self.eval_mode = cfg.dataset.eval_mode - self.data_prepared = False - self.epoch_number = 0 - self.prompt_learning = self.cfg.prompt_learning - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - self.unreduced_loss_fct = torch.nn.CrossEntropyLoss(reduction='none') - elif self.cfg.library == "megatron": - if self.prompt_learning: - if os.path.exists(cfg.prompt_learning_nemo_path): - self.language_model = MegatronGPTPromptLearningModel.restore_from( - cfg.prompt_learning_nemo_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - ) - else: - # removing tokenizer cfg as this triggers tokenizer construction which is not helpful here as we have a separate tokenizer - new_cfg = copy.copy(cfg) - del new_cfg.tokenizer - new_cfg.nemo_path = cfg.prompt_learning_nemo_path - self.language_model = MegatronGPTPromptLearningModel(new_cfg, trainer) - else: - self.language_model = MegatronGPTModel.restore_from(cfg.language_model.lm_checkpoint, trainer=trainer) - - all_labels = list( - self._train_dl.dataset.all_possible_labels.union( - self._validation_dl.dataset.all_possible_labels, self._test_dl.dataset.all_possible_labels - ) - ) - self.label_to_ids = collections.defaultdict(int) - - for i in range(len(all_labels)): - self.label_to_ids[all_labels[i]] = i - - self.all_existing_labels = set(self.label_to_ids.keys()) - - self.token_to_words = {} - self.classification_report = 
ClassificationReport( - num_classes=len(self.label_to_ids) + 1, mode='micro', label_ids=self.label_to_ids, dist_sync_on_step=True - ) - - def setup_optimizer_param_groups(self): - """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. - Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning - rate for the frozen model's params will always be zero effectively - freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. - """ - if not self.prompt_learning: - super().setup_optimizer_param_groups() - return - # Freeze frozen model - for param in self.language_model.frozen_model.parameters(): - param.requires_grad = False - - virtual_prompt_params = {'params': []} - - if self.language_model.frozen_model.model.pre_process: - virtual_prompt_params['params'].extend([param for param in self.language_model.prompt_table.parameters()]) - - if self.language_model.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: - virtual_prompt_params['params'].extend( - [param for param in self.language_model.prompt_encoder.parameters()] - ) - self._optimizer_param_groups = (virtual_prompt_params,) - - def training_step(self, batch, batch_idx): - ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - template_length, - utterance_length, - correct_candidate, - ) = batch - # construct training samples as generating " Answer: yes/no" after " : " - if self.eval_mode == "binary_score": - new_input_ids = [] - new_attn_masks = [] - for i in range(candidate_input_ids.size(0)): - # in some datasets like assistant, there might be 60+ possible intents with 1 correct intent - # therefore we might not want to use all possible intents as negative samples - # instead use {binary_score_subsample_ratio} negative samples for every positive sample - if self.cfg.dataset.binary_score_subsample: - new_input_ids.append(candidate_input_ids[i, 2 * correct_candidate[i].item(), :]) - new_attn_masks.append(candidate_attn_masks[i, 2 * correct_candidate[i].item(), :]) - possible_negatives = [] - for j in range(0, candidate_input_ids.size(1), 2): - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - if j != 2 * correct_candidate[i].item(): - possible_negatives.append(j) - negative_samples = random.choices( - possible_negatives, k=int(self.cfg.dataset.binary_score_subsample_ratio) - ) - for negative_sample in negative_samples: - new_input_ids.append(candidate_input_ids[i, negative_sample, :]) - new_attn_masks.append(candidate_attn_masks[i, negative_sample, :]) - - else: - for j in range(0, candidate_input_ids.size(1), 2): - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - new_input_ids.append(candidate_input_ids[i, j, :]) - new_attn_masks.append(candidate_attn_masks[i, j, :]) - input_ids = torch.stack(new_input_ids) - attn_masks = torch.stack(new_attn_masks) - labels = self.get_binary_score_labels(input_ids) - - loss, _ = self(input_ids, attn_masks, labels, inference=False) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def 
on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - - with_slots = self.cfg.dataset.target_template == "with_slots" - - generated_labels, generated_slots = DialogueClassificationMetrics.split_label_and_slots( - generated_field, with_slots=with_slots - ) - ground_truth_labels, ground_truth_slots = DialogueClassificationMetrics.split_label_and_slots( - ground_truth_field, with_slots=with_slots - ) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueClassificationMetrics.save_predictions( - filename, - generated_labels, - generated_slots, - ground_truth_labels, - ground_truth_slots, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_labels[i] == ground_truth_labels[i]) for i in range(len(generated_labels))]) - - generated_field_ids = torch.tensor([self.label_to_ids[label] for label in generated_labels], dtype=int).to( - self.classification_report.device - ) - - ground_truth_field_ids = torch.tensor( - [self.label_to_ids[label] for label in ground_truth_labels], dtype=int - ).to(self.classification_report.device) - - tp, fn, fp, _ = self.classification_report(generated_field_ids, ground_truth_field_ids) - - precision, recall, f1, report = self.classification_report.compute() - self.classification_report.reset() - - ( - slot_precision, - slot_recall, - slot_f1, - slot_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - logging.info(report) - - self.log('{}_precision'.format(self.cfg.dataset.field), precision) - self.log('{}_f1'.format(self.cfg.dataset.field), f1) - self.log('{}_recall'.format(self.cfg.dataset.field), recall) - self.log('{}_{}_accuracy'.format(mode, self.cfg.dataset.field), label_acc * 100) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - self.log('slot_joint_goal_accuracy', slot_joint_goal_accuracy) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/epoch-{}-model.bin'.format(self.cfg.dataset.dialogues_example_dir, self.epoch_number) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def on_train_end(self): - if self.prompt_learning: - self.language_model.on_train_end() - - def get_prompt_token_labels_for_megatron_gpt(self, input_ids, num_prompt_tokens): - - prompt_token_labels = torch.full( - size=(input_ids.size(0), num_prompt_tokens), - fill_value=self.tokenizer.tokenizer.pad_token_id, - dtype=torch.long, - ) - - if self.prompt_learning: - prompt_token_labels.data = torch.LongTensor( - 
np.tile(np.array(self.language_model.pseudo_token_ids), (input_ids.size(0), 1)) - ) - - prompt_token_labels = prompt_token_labels.to(input_ids.device) - - return prompt_token_labels - - def get_virtual_prompt_ids_for_megatron_gpt(self, input_ids): - if ( - self.cfg.virtual_prompt_style == VirtualPromptStyle.P_TUNING - or not self.prompt_learning - or self.trainer.testing - ): - prompt_ids = torch.tensor([0] * input_ids.size(0)).to(input_ids.device) if self.prompt_learning else None - else: - total_virtual_tokens = self.cfg.task_templates[0].total_virtual_tokens - init_text = self.cfg.task_templates[0].taskname - init_text_ids = self.tokenizer.text_to_ids(init_text) - init_text_ids = torch.tensor(init_text_ids).to(input_ids.device) - prompt_ids = init_text_ids.repeat(input_ids.size(0), 1)[:, :total_virtual_tokens] - return prompt_ids - - def forward(self, input_ids, attention_mask, labels, inference=True): - - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) - loss = output['loss'] - # calculate loss per sample - b_logits = output['logits'] - shift_logits = b_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - unreduced_loss = self.unreduced_loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - loss_per_sample = torch.mean(unreduced_loss.view(shift_labels.size()), dim=-1) - elif self.cfg.library == "megatron": - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - position_ids = torch.arange( - start=0, - end=num_prompt_tokens + input_ids.size(1), - dtype=torch.long, - device=input_ids.device, - ) - - prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) - - attn_mask_add_on = torch.ones((attention_mask.size(0), num_prompt_tokens), device=attention_mask.device) - full_attention_mask = torch.cat([attn_mask_add_on, attention_mask], axis=-1) - full_attention_mask_expand = torch.tril( - full_attention_mask.unsqueeze(2).tile(full_attention_mask.size(1)) - ).unsqueeze(1) - - attn_mask = full_attention_mask_expand <= 0 - - prompt_token_labels = self.get_prompt_token_labels_for_megatron_gpt(input_ids, num_prompt_tokens) - - input_ids_new = torch.cat([prompt_token_labels, input_ids], axis=1) - make_up_last_column_input_ids = ( - torch.ones_like(input_ids_new[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - ) - left_shifted_input_ids = torch.cat([input_ids_new[:, 1:], make_up_last_column_input_ids], axis=-1) - if self.prompt_learning: - unmasked_unreduced_loss = self.language_model( - input_ids_new, - position_ids, - attn_mask, - labels=left_shifted_input_ids, - taskname_ids=prompt_ids, - inference=inference, - ) - else: - unmasked_unreduced_loss = self.language_model( - input_ids, position_ids, attn_mask, labels=left_shifted_input_ids - ) - - if isinstance(unmasked_unreduced_loss, tuple): - unmasked_unreduced_loss = unmasked_unreduced_loss[0] - - labels = torch.cat([prompt_token_labels, labels], axis=1) - make_up_last_column_labels = torch.ones_like(labels[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - new_labels = torch.cat([labels[:, 1:], make_up_last_column_labels], axis=-1) - filler = torch.zeros_like(new_labels) - labels_mask_0 = torch.where(new_labels != -100, new_labels, filler) - labels_mask = labels_mask_0 > 0 - - loss = self.mask_and_reduce_loss(labels_mask, unmasked_unreduced_loss) - loss_per_sample = 
self.mask_and_reduce_loss_per_sample(labels_mask, unmasked_unreduced_loss) - - return loss, loss_per_sample - - def mask_and_reduce_loss_per_sample(self, loss_mask, unmasked_unreduced_loss): - """ - Mask and reduce loss based on each sample in batch - Useful for ranking candidates with the same prompt in batch based on loss - """ - losses = unmasked_unreduced_loss.float() - loss_mask = loss_mask.view(-1).float() - masked_loss = losses.view(-1) * loss_mask - loss_per_sample = torch.mean(masked_loss.view(unmasked_unreduced_loss.size()), dim=-1) - return loss_per_sample - - def mask_and_reduce_loss(self, loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss - - def decode(self, tokens): - if tokens not in self.token_to_words: - self.token_to_words[tokens] = self.tokenizer.tokenizer.decode(tokens) - return self.token_to_words[tokens] - - def binary_score_candidates( - self, - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - correct_candidate, - minus_negative=True, - inference=False, - ): - best_candidate_input_ids = [] - - for i in range(candidate_input_ids.size(0)): - best_j = 0 - - lowest_loss = float("inf") - - for j in range(0, candidate_input_ids.size(1), 2): - - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - - start_yes = j if j // 2 == correct_candidate[i].item() else j + 1 - - cand_loss = self( - candidate_input_ids[i, start_yes : start_yes + 1, :], - candidate_attn_masks[i, start_yes : start_yes + 1, :], - self.get_binary_score_labels(candidate_input_ids[i, start_yes : start_yes + 1, :]), - inference=inference, - ) - - considered_loss = cand_loss.item() - - if minus_negative: - start_no = j + 1 if j // 2 == correct_candidate[i].item() else j - - negative_cand_loss = self( - candidate_input_ids[i, start_no : start_no + 1, :], - candidate_attn_masks[i, start_no : start_no + 1, :], - self.get_binary_score_labels(candidate_input_ids[i, start_no : start_no + 1, :]), - inference=inference, - ) - considered_loss -= negative_cand_loss.item() - - if considered_loss < lowest_loss: - best_j = start_yes - lowest_loss = considered_loss - - best_candidate_input_ids.append(candidate_input_ids[i, best_j, :]) - - candidate_tokens = torch.stack(best_candidate_input_ids) - generated_field, ground_truth_field = self.process_into_structured_fields( - candidate_tokens, labels, template_length=template_length - ) - return generated_field, ground_truth_field - - def get_binary_score_labels(self, input_ids): - # mask out every token except the last token for yes/no/true/false - labels = torch.zeros_like(input_ids) - - for i in range(input_ids.size(0)): - for j in range(input_ids.size(1)): - if input_ids.data[0, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - last_point = stop_point - 1 - labels.data[i, last_point] = input_ids[i, last_point] - - return labels - - def rank_candidates( - self, - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - minus_prior=True, - inference=False, - ): - best_candidate_input_ids = [] - - for i in range(candidate_input_ids.size(0)): - # candidates are padded with first candidate to ensure equal number of candidates in batch - # run for loop to strip redundant candidates - last_j = candidate_input_ids.size(1) - for j in range(1, candidate_input_ids.size(1)): - if 
torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - last_j = j - break - - utterance_end = utterance_length[i].item() - # this might cause GPU memory pressure there are many candidates - # if OOM, re-write to do this in a for loop with as many as train_ds.batch_size - _, loss_per_sample = self( - candidate_input_ids[i, :last_j, :], - candidate_attn_masks[i, :last_j, :], - candidate_input_ids[i, :last_j, :], - inference=inference, - ) - - if minus_prior: - _, utterance_free_cand_loss_per_sample = self( - candidate_input_ids[i, :last_j, utterance_end:], - candidate_attn_masks[i, :last_j, utterance_end:], - candidate_input_ids[i, :last_j, utterance_end:], - inference=inference, - ) - considered_loss = loss_per_sample - utterance_free_cand_loss_per_sample - else: - considered_loss = loss_per_sample - best_j = torch.argmin(considered_loss) - best_candidate_input_ids.append(candidate_input_ids[i, best_j, :]) - - candidate_tokens = torch.stack(best_candidate_input_ids) - generated_field, ground_truth_field = self.process_into_structured_fields( - candidate_tokens, labels, template_length=template_length - ) - return generated_field, ground_truth_field - - def generate_candidates(self, labels, template_length, input_ids, attn_masks): - - tokens_to_generate = self.cfg.tokens_to_generate - - if self.cfg.library == "huggingface": - generated_tokens = [] - max_length = 0 - for i in range(input_ids.size(0)): - param_dict = { - "input_ids": input_ids[i : i + 1, : template_length[i]], - "max_length": template_length[i] + tokens_to_generate, - "pad_token_id": self.tokenizer.tokenizer.pad_token_id, - } - generated_tokens.append(self.language_model.generate(**param_dict)) - max_length = max(max_length, generated_tokens[-1].size(1)) - - # pad each generated to ensure they are of same length in dim 1, therefore stack-able - generated_tokens = [ - torch.cat( - [i, torch.ones((1, max_length - i.size(1))).to(i.device) * self.tokenizer.tokenizer.pad_token_id], - axis=-1, - ) - for i in generated_tokens - ] - generated_tokens = torch.cat(generated_tokens, axis=0) - num_prompt_tokens = 0 - - elif self.cfg.library == "megatron": - - prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) - - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - - prompt_token_labels = self.get_prompt_token_labels_for_megatron_gpt(input_ids, num_prompt_tokens) - input_ids_without_answers = [ - torch.cat( - [ - input_ids[i, : template_length[i]], - torch.ones((input_ids.size(1) - template_length[i].item(),)).to(input_ids.device) - * self.tokenizer.tokenizer.pad_token_id, - ], - axis=-1, - ).type(input_ids.dtype) - for i in range(input_ids.size(0)) - ] - input_ids_without_answers = torch.stack(input_ids_without_answers) - input_ids_new = torch.cat( - [ - prompt_token_labels, - input_ids_without_answers, - torch.ones((input_ids.size(0), tokens_to_generate)).to(input_ids.device) - * self.tokenizer.tokenizer.pad_token_id, - ], - axis=1, - ).type(input_ids.dtype) - - tokens_for_generation = (input_ids_new, template_length + num_prompt_tokens) - - length_param: LengthParam = {"min_length": 0, "max_length": tokens_to_generate} - - generated_dict = megatron_gpt_generate( - self.language_model, - tokens_for_generation, - self.tokenizer, - length_param, - get_default_sampling_params(), - task_ids=prompt_ids, - ) - generated_tokens = torch.LongTensor(generated_dict['token_ids']) - - generated_field, ground_truth_field = 
self.process_into_structured_fields( - generated_tokens, labels, template_length=template_length + num_prompt_tokens - ) - return generated_field, ground_truth_field - - def eval_step_helper(self, batch, mode='val'): - ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - template_length, - utterance_length, - correct_candidate, - ) = batch - - inference = mode == 'test' - loss, _ = self(input_ids, attn_masks, labels, inference=inference) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - # ranking using perplexity of candidates following the " :" - if self.eval_mode == "ranking": - generated_field, ground_truth_field = self.rank_candidates( - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - inference=inference, - ) - # autoregressively generate candidates (possibly with constraint) - elif self.eval_mode == "generation": - generated_field, ground_truth_field = self.generate_candidates( - labels, template_length, input_ids, attn_masks - ) - # comparing likelihood based on the perplexity of generating " Answer: yes" after " : " - # (optionally, the difference of that with " Answer: no" using the flag minus_negative=True) - elif self.eval_mode == "binary_score": - generated_field, ground_truth_field = self.binary_score_candidates( - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - correct_candidate, - inference=inference, - ) - - else: - raise ValueError( - "{} is not among supported options (ranking, generation, binary_score)".format(self.eval_mode) - ) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def process_into_structured_fields(self, generated_tokens, labels, template_length=None): - - generated_field = [] - - for i in range(generated_tokens.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = generated_tokens.size(1) - - for j in range(start_point, stop_point): - if generated_tokens.data[i, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - - # this is to account for the tokens ' Answer: ' + 'yes'/'no'/'true'/'false' - if self.eval_mode == "binary_score": - stop_point -= 3 - - one_generated_field = self.decode(generated_tokens[i, start_point:stop_point]).strip() - generated_field.append(one_generated_field) - - ground_truth_field = self.process_ground_truth_field(labels) - - return generated_field, ground_truth_field - - def process_ground_truth_field(self, labels): - ground_truth_field = [] - - for i in range(labels.size(0)): - correct_label = tuple( - [j for j in labels.data[i] if j != self.tokenizer.tokenizer.pad_token_id and j != -100] - ) - ground_truth_field.append(self.decode(correct_label).strip()) - - return ground_truth_field - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == 'sgd': - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - elif self._cfg.dataset.task in ['assistant', "zero_shot"]: - self.dialogues_processor = DialogueAssistantDataProcessor( - data_dir=self._cfg.dataset.data_dir, 
tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'design': - self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") - - self.data_prepared = True - - def setup(self, stage=None): - super().setup(stage) - if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit": - if self.cfg.virtual_prompt_style == VirtualPromptStyle.P_TUNING: - self.language_model.init_prompt_encoder() - else: - raise ValueError( - "Use model.virtual_prompt_style='p-tuning' with model.p_tuning.encoder_type='embedding' to enable prompt-tuning." - ) - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueGPTClassificationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py deleted file mode 100644 index 7fb0ba770189..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader -from transformers import AutoModelWithLMHead - -from nemo.collections.nlp.data.dialogue.data_processor.mellon_qa_data_processor import DialogueMellonQADataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.ms_marco_data_processor import DialogueMSMarcoDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_generation_dataset import DialogueGPTGenerationDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( - MegatronGPTPromptLearningModel, -) -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueGPTGenerationModel'] - -NUM_TASKS = 1 # focussing on intent currently 6 # number of multi-head tasks - - -class DialogueGPTGenerationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueGPTGenerationModel") - - self.cfg = cfg - self.data_prepared = False - - self.setup_tokenizer(cfg.tokenizer) - self.tokenizer.tokenizer.pad_token = self.tokenizer.tokenizer.eos_token - self.epoch_number = 0 - self.prompt_learning = self.cfg.prompt_learning - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - if self.cfg.language_model.lm_checkpoint: - self.language_model.load_state_dict(torch.load(self.cfg.language_model.lm_checkpoint)) - elif self.cfg.library == "megatron": - if self.prompt_learning: - # removing tokenizer cfg as this triggers tokenizer construction which is not helpful here as we have a separate tokenizer - new_cfg = copy.copy(cfg) - del new_cfg.tokenizer - self.language_model = MegatronGPTPromptLearningModel(new_cfg, trainer) - else: - self.language_model = MegatronGPTModel.restore_from(cfg.language_model.lm_checkpoint, 
trainer=trainer) - - def training_step(self, batch, batch_idx): - input_ids, attn_masks, labels, _, _ = batch - - loss = self(input_ids, attn_masks, labels, inference=False) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - loss = [] - - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - loss.append(output["loss"].item()) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueGenerationMetrics.save_predictions( - filename, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) - precision, recall, f1 = DialogueGenerationMetrics.get_f1(generated_field, ground_truth_field) - bleu = DialogueGenerationMetrics.get_bleu(generated_field, ground_truth_field) - avg_loss = np.mean(loss) - ppl = np.exp(avg_loss) - - self.log('{}_accuracy'.format(mode), label_acc * 100) - self.log('precision', precision) - self.log('recall', recall) - self.log('f1', f1) - self.log('bleu', bleu) - self.log('{}_loss'.format(mode), avg_loss) - self.log('{}_ppl'.format(mode), ppl) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/val_loss-{}-epoch-{}-answer-extender.bin'.format( - self.cfg.dataset.dialogues_example_dir, avg_loss, self.epoch_number - ) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def forward(self, input_ids, attention_mask, labels, inference=True): - - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) - loss = output['loss'] - - elif self.cfg.library == "megatron": - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - - position_ids = torch.arange( - start=0, - end=num_prompt_tokens + input_ids.size(1), - dtype=torch.long, - device=input_ids.device, - ) - - position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) - - prompt_ids = torch.tensor([0] * input_ids.size(0)) if self.prompt_learning else None - - attn_mask_add_on = torch.ones((attention_mask.size(0), num_prompt_tokens), device=attention_mask.device) - full_attention_mask = torch.cat([attn_mask_add_on, attention_mask], axis=-1) - full_attention_mask_expand = torch.tril( - full_attention_mask.unsqueeze(2).tile(full_attention_mask.size(1)) - ).unsqueeze(1) 
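The megatron branch of the removed forward() builds its own causal attention mask from the padding mask and masks the loss over left-shifted labels. That pattern is easier to follow on toy tensors; the sketch below mirrors the tril-based mask expansion and masked loss reduction shown here, with batch size, sequence length, token ids, and pad id chosen purely for illustration.

    # Illustrative restatement of the mask construction and loss reduction pattern.
    import torch

    pad_id = 0
    attention_mask = torch.tensor([[1, 1, 1, 0]])  # (batch, seq): 1 = real token, 0 = pad
    # expand the padding mask into a per-position lower-triangular (causal) mask
    full = torch.tril(attention_mask.unsqueeze(2).tile(attention_mask.size(1))).unsqueeze(1)
    attn_mask = full > 0  # boolean (batch, 1, seq, seq) mask, True where attention is kept

    # labels are the inputs shifted left by one (next-token prediction), padded at the end
    input_ids = torch.tensor([[11, 12, 13, pad_id]])
    labels = torch.cat([input_ids[:, 1:], torch.full_like(input_ids[:, -1:], pad_id)], dim=-1)
    # with pad_id = 0 in this toy example, the > 0 test drops pad and ignore-index positions
    loss_mask = (torch.where(labels != -100, labels, torch.zeros_like(labels)) > 0).float()

    # per-token losses would come from the language model; random values stand in here
    unreduced_loss = torch.rand_like(loss_mask)
    loss = torch.sum(unreduced_loss.view(-1) * loss_mask.view(-1)) / loss_mask.sum()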
- - attn_mask = full_attention_mask_expand > 0 - - prompt_token_labels = torch.full( - size=(input_ids.size(0), num_prompt_tokens), - fill_value=self.tokenizer.tokenizer.pad_token_id, - dtype=torch.long, - ) - - if self.prompt_learning: - prompt_token_labels.data = torch.LongTensor( - np.tile(np.array(self.language_model.pseudo_token_ids), (input_ids.size(0), 1)) - ) - - prompt_token_labels = prompt_token_labels.to(input_ids.device) - - input_ids_new = torch.cat([torch.zeros_like(prompt_token_labels), input_ids], axis=1) - make_up_last_column_input_ids = ( - torch.ones_like(input_ids_new[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - ) - left_shifted_input_ids = torch.cat([input_ids_new[:, 1:], make_up_last_column_input_ids], axis=-1) - if self.prompt_learning: - unmasked_unreduced_loss = self.language_model( - input_ids_new, - position_ids, - attn_mask, - labels=left_shifted_input_ids, - taskname_ids=prompt_ids, - inference=inference, - ) - else: - unmasked_unreduced_loss = self.language_model( - input_ids, position_ids, attn_mask, labels=left_shifted_input_ids - ) - - if isinstance(unmasked_unreduced_loss, tuple): - unmasked_unreduced_loss = unmasked_unreduced_loss[0] - - labels = torch.cat([prompt_token_labels, labels], axis=1) - make_up_last_column_labels = torch.ones_like(labels[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - new_labels = torch.cat([labels[:, 1:], make_up_last_column_labels], axis=-1) - filler = torch.zeros_like(new_labels) - labels_mask_0 = torch.where(new_labels != -100, new_labels, filler) - labels_mask = labels_mask_0 > 0 - - loss = self.mask_and_reduce_loss(labels_mask, unmasked_unreduced_loss) - return loss - - def mask_and_reduce_loss(self, loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss - - def setup(self, stage=None): - super().setup(stage) - if self.cfg.library == "megatron" and self.prompt_learning: - self.language_model.init_new_prompts() - - def prepare_megatron_generation(self, labels, input_ids, template_length): - """ - # adapted from MegatronGPTModel._bucketize_gpt_inference - """ - batch_size = labels.size(0) - prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None - batch_tokens = input_ids.tolist() - - # unpad tokens - lens = template_length - indxs = [index for index in range(batch_size)] - for lenn, index in zip(lens, indxs): - batch_tokens[index] = batch_tokens[index][:lenn] - - # chunk tokens by same length - pre_buckets, lens = [], list(set(lens.tolist())) - for lenn in lens: - pre_buckets.append([(tokens, index) for index, tokens in enumerate(batch_tokens) if len(tokens) == lenn]) - - buckets, positions, bucket_prompt_tags = [], [], [] - - # get buckets and prompts initial positions - for bucket in pre_buckets: - buckets.append(torch.tensor([item[0] for item in bucket]).to(device=labels.device)) - positions.append([item[1] for item in bucket]) - - # bucket prompt tags identically to their corresponding examples - if prompt_tags: - bucket_prompt_tags.append([prompt_tags[item[1]] for item in bucket]) - - # Flatten position list - positions = [item for sublist in positions for item in sublist] - - # Flatten buckets and bucket_prompt_tags # temp fix for megatron complete issue. 
However, this is also slower than bucketized inference - buckets = [item.unsqueeze(0) for sublist in buckets for item in sublist] - bucket_prompt_tags = [[item] for sublist in bucket_prompt_tags for item in sublist] - - request = {"tokens": buckets, "prompt_tags": bucket_prompt_tags} - - return positions, request - - def post_process_megatron_generation(self, outputs): - text_outputs = [output[0] for output in outputs] - generated_tokens = self.tokenizer.tokenizer(text_outputs, padding=True, return_tensors="pt").data["input_ids"] - return generated_tokens - - def generate_candidates(self, labels, template_length, input_ids, attn_masks): - - tokens_to_generate = self.cfg.tokens_to_generate - if self.cfg.library == "huggingface": - generated_tokens = [] - max_length = 0 - for i in range(input_ids.size(0)): - param_dict = { - "input_ids": input_ids[i : i + 1, : template_length[i]], - "max_length": template_length[i] + tokens_to_generate, - "pad_token_id": self.tokenizer.tokenizer.pad_token_id, - } - generated_tokens.append(self.language_model.generate(**param_dict)) - max_length = max(max_length, generated_tokens[-1].size(1)) - - # pad each generated to ensure they are of same length in dim 1, therefore stack-able - generated_tokens = [ - torch.cat( - [i, torch.ones((1, max_length - i.size(1))).to(i.device) * self.tokenizer.tokenizer.pad_token_id], - axis=-1, - ) - for i in generated_tokens - ] - generated_tokens = torch.cat(generated_tokens, axis=0) - - elif self.cfg.library == "megatron": - positions, request = self.prepare_megatron_generation(labels, input_ids, template_length) - outputs = self.language_model.complete(request, positions, tokens_to_generate) - generated_tokens = self.post_process_megatron_generation(outputs) - - generated_field = self.process_into_structured_fields(generated_tokens, template_length=template_length) - - ground_truth_field = self.process_into_structured_fields(labels, template_length=template_length) - - return generated_field, ground_truth_field - - def process_into_structured_fields(self, full_seq_ids, template_length=None): - - structured_field = [] - for i in range(full_seq_ids.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = full_seq_ids.size(1) - - for j in range(start_point, stop_point): - if full_seq_ids.data[i, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - one_generated_field = self.tokenizer.tokenizer.decode(full_seq_ids[i, start_point:stop_point]).strip() - structured_field.append(one_generated_field) - return structured_field - - def eval_step_helper(self, batch, mode='val'): - - input_ids, attn_masks, labels, template_length, utterance_length = batch - - loss = self(input_ids, attn_masks, labels) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - # autoregressively generate candidates (possibly with constraint) - generated_field, ground_truth_field = self.generate_candidates(labels, template_length, input_ids, attn_masks) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == "ms_marco": - self.dialogues_processor = DialogueMSMarcoDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, 
cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == "mellon_qa": - self.dialogues_processor = DialogueMellonQADataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - else: - raise ValueError("Only ms_marco and mellon_qa supported for Dialogue GPT Generation Model") - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueGPTGenerationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py deleted file mode 100644 index 9bf7ae2a9116..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Optional - -import numpy as np -import torch -import torch.nn.functional as F -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoModel - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_nearest_neighbour_dataset import ( - DialogueNearestNeighbourDataset, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueNearestNeighbourModel'] - - -class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("DialogueNearestNeighbourModel") - - self.cfg = cfg - super().__init__(cfg=cfg, trainer=trainer) - if self.cfg.library == "huggingface": - self.language_model = AutoModel.from_pretrained(self.cfg.language_model.pretrained_model_name) - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split) -> 'torch.utils.data.DataLoader': - if self._cfg.dataset.task == "zero_shot": - self.data_processor = DialogueAssistantDataProcessor( - self.cfg.data_dir, self.tokenizer, cfg=self.cfg.dataset - ) - elif self._cfg.dataset.task == "design": - self.data_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'sgd': - self.data_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise ValueError("Only zero_shot, design and sgd supported for Zero Shot Intent Model") - - dataset = DialogueNearestNeighbourDataset( - dataset_split, - self.data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 0), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - def forward(self, input_ids, attention_mask): - if self.cfg.library == 'huggingface': - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask) - return output - - def 
training_step(self, batch, batch_idx): - raise NotImplementedError - - def test_step(self, batch, batch_idx): - loss = self.validation_step(batch, batch_idx, mode='test') - self.test_step_outputs.append(loss) - return loss - - @staticmethod - def mean_pooling(model_output, attention_mask): - token_embeddings = model_output[0] # First element of model_output contains all token embeddings - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) - - def validation_step(self, batch, batch_idx, mode='val'): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - input_ids, input_mask, labels = batch - preds = [] - gts = [] - inputs = [] - for i in range(input_ids.size(0)): - output = self.forward(input_ids=input_ids[i], attention_mask=input_mask[i]) - sentence_embeddings = DialogueNearestNeighbourModel.mean_pooling(output, input_mask[i]) - sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) - cos_sim = F.cosine_similarity(sentence_embeddings[:1, :], sentence_embeddings[1:, :]) - pred = torch.argmax(cos_sim).item() + 1 - gt = torch.argmax(labels[i][1:]).item() + 1 - - preds.append(input_ids[i, pred]) - gts.append(input_ids[i, gt]) - inputs.append(input_ids[i, 0]) - - loss = {'preds': torch.stack(preds), 'labels': torch.stack(gts), 'inputs': torch.stack(inputs)} - self.validation_step_outputs.append(loss) - return loss - - def multi_test_epoch_end(self, outputs, dataloader_idx): - return self.on_validation_epoch_end() - - def on_validation_epoch_end(self): - """ - Get metrics based on the candidate label with the highest predicted likelihood and the ground truth label for intent - """ - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - output_preds = torch.cat([output['preds'] for output in outputs], dim=0) - output_labels = torch.cat([output['labels'] for output in outputs], dim=0) - inputs = torch.cat([output['inputs'] for output in outputs], dim=0) - - decoded_preds = self.tokenizer.tokenizer.batch_decode(output_preds, skip_special_tokens=True) - decoded_labels = self.tokenizer.tokenizer.batch_decode(output_labels, skip_special_tokens=True) - decoded_inputs = self.tokenizer.tokenizer.batch_decode(inputs, skip_special_tokens=True) - - prompt_len = len(self.cfg.dataset.prompt_template.strip()) - predicted_labels = [i[prompt_len:].strip() for i in decoded_preds] - ground_truth_labels = [i[prompt_len:].strip() for i in decoded_labels] - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") - - DialogueGenerationMetrics.save_predictions( - filename, - predicted_labels, - ground_truth_labels, - decoded_inputs, - ) - - label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} - self.classification_report = ClassificationReport( - num_classes=len(label_to_ids), mode='micro', label_ids=label_to_ids, dist_sync_on_step=True - ).to(output_preds[0].device) - - predicted_label_ids = torch.tensor([label_to_ids[label] for label in predicted_labels]).to( - output_preds[0].device - ) - ground_truth_label_ids = torch.tensor([label_to_ids[label] for label in ground_truth_labels]).to( - output_preds[0].device - ) - - 
tp, fn, fp, _ = self.classification_report(predicted_label_ids, ground_truth_label_ids) - - precision, recall, f1, report = self.classification_report.compute() - label_acc = np.mean([int(predicted_labels[i] == ground_truth_labels[i]) for i in range(len(predicted_labels))]) - - logging.info(report) - - self.log('unified_precision', precision) - self.log('unified_f1', f1) - self.log('unified_recall', recall) - self.log('unified_accuracy', label_acc * 100) - - self.classification_report.reset() - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config: - logging.info( - f"Dataloader config or file_name for the training set is missing, so no data loader for the training set is created!" - ) - self._train_dl = None - return - self._train_dl = self._setup_dataloader_from_config(train_data_config, "train") - - # self.create_loss_module() - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config: - logging.info( - f"Dataloader config or file_path for the validation data set is missing, so no data loader for the validation set is created!" - ) - self._validation_dl = None - return - self._validation_dl = self._setup_dataloader_from_config(val_data_config, "dev") - - def setup_multiple_test_data(self, test_data_config: Optional[DictConfig]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - if not test_data_config: - logging.info( - f"Dataloader config or file_path for the test data set is missing, so no data loader for the test set is created!" - ) - self._test_dl = None - return - self._test_dl = self._setup_dataloader_from_config(test_data_config, "test") - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py deleted file mode 100644 index 3f0d09d7dc66..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
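The nearest-neighbour scoring removed above reduces to masked mean pooling over token embeddings followed by cosine similarity between the query and each candidate. A minimal, self-contained sketch of that idea, assuming a Hugging Face AutoModel sentence encoder; the checkpoint name and function names below are illustrative, not the removed class's API:

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def mean_pool(last_hidden_state, attention_mask):
    # Average token embeddings, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

def rank_candidates(query, candidates, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).eval()
    batch = tokenizer([query] + candidates, padding=True, return_tensors="pt")
    with torch.no_grad():
        out = model(**batch)
    emb = F.normalize(mean_pool(out.last_hidden_state, batch["attention_mask"]), p=2, dim=1)
    scores = F.cosine_similarity(emb[:1], emb[1:])  # one score per candidate
    return candidates[int(scores.argmax())], scores.tolist()

L2-normalising the pooled embeddings first keeps the scores in [-1, 1] regardless of utterance length, which mirrors what the removed validation_step computes per example.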
- -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf, open_dict -from torch.utils.data import DataLoader -from transformers import AutoModelForSeq2SeqLM - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.mellon_qa_data_processor import DialogueMellonQADataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.ms_marco_data_processor import DialogueMSMarcoDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -try: - from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator - -except (ImportError, ModuleNotFoundError): - logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") - from apex.transformer.pipeline_parallel.utils import ( - _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, - ) - -__all__ = ['DialogueS2SGenerationModel'] - - -class DialogueS2SGenerationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueS2SGenerationModel") - - self.cfg = cfg - self.data_prepared = False - self.epoch_number = 0 - if self.cfg.library == "huggingface": - self.setup_tokenizer(cfg.tokenizer) - elif self.cfg.library == "megatron": - # supporting MegatronT5Model in precision = fp16 - t5_cfg = MegatronT5Model.restore_from( - restore_path=cfg.language_model.lm_checkpoint, trainer=trainer, return_config=True - ) - # Override the T5 configuration with the one from the config file. 
- OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.masked_softmax_fusion = False - t5_cfg.precision = 16 - t5_cfg.encoder_arch = 'transformer' - t5_cfg.decoder_arch = 'transformer' - - language_model = MegatronT5Model.restore_from( - restore_path=cfg.language_model.lm_checkpoint, trainer=trainer, override_config_path=t5_cfg - ) - self.tokenizer = language_model.tokenizer - - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelForSeq2SeqLM.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - if self.cfg.language_model.lm_checkpoint: - self.language_model.load_state_dict(torch.load(self.cfg.language_model.lm_checkpoint)) - elif self.cfg.library == "megatron": - self.language_model = language_model - - def training_step(self, batch, batch_idx): - input_ids, attn_masks, labels = batch - loss = self(input_ids, attn_masks, labels) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - loss = [] - - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - loss.append(output["loss"].item()) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueGenerationMetrics.save_predictions( - filename, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) - precision, recall, f1 = DialogueGenerationMetrics.get_f1(generated_field, ground_truth_field) - bleu = DialogueGenerationMetrics.get_bleu(generated_field, ground_truth_field) - avg_loss = np.mean(loss) - ppl = np.exp(avg_loss) - - self.log('{}_accuracy'.format(mode), label_acc * 100) - self.log('precision', precision) - self.log('recall', recall) - self.log('f1', f1) - self.log('bleu', bleu) - self.log('{}_loss'.format(mode), avg_loss) - self.log('{}_ppl'.format(mode), ppl) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/val_loss-{}-epoch-{}-answer-extender.bin'.format( - self.cfg.dataset.dialogues_example_dir, avg_loss, self.epoch_number - ) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def forward(self, input_ids, attention_masks, labels): - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, 
attention_mask=attention_masks, labels=labels) - loss = output['loss'] - elif self.cfg.library == "megatron": - - labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) - decoder_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) - - unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], attention_masks, decoder_attn_masks[:, :-1], lm_labels=labels[:, 1:] - ) - loss = self.language_model.loss_func(decoder_attn_masks[:, 1:].contiguous(), unmasked_unreduced_loss) - return loss - - def prepare_megatron_generation(self, labels, input_ids, template_length): - """ - # adapted from MegatronGPTModel._bucketize_gpt_inference - """ - batch_size = labels.size(0) - prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None - batch_tokens = input_ids.tolist() - - # unpad tokens - lens = template_length - indxs = [index for index in range(batch_size)] - for lenn, index in zip(lens, indxs): - batch_tokens[index] = batch_tokens[index][:lenn] - - # chunk tokens by same length - pre_buckets, lens = [], list(set(lens.tolist())) - for lenn in lens: - pre_buckets.append([(tokens, index) for index, tokens in enumerate(batch_tokens) if len(tokens) == lenn]) - - buckets, positions, bucket_prompt_tags = [], [], [] - - # get buckets and prompts initial positions - for bucket in pre_buckets: - buckets.append(torch.tensor([item[0] for item in bucket]).to(device=labels.device)) - positions.append([item[1] for item in bucket]) - - # bucket prompt tags identically to their corresponding examples - if prompt_tags: - bucket_prompt_tags.append([prompt_tags[item[1]] for item in bucket]) - - # Flatten position list - positions = [item for sublist in positions for item in sublist] - - # Flatten buckets and bucket_prompt_tags # temp fix for megatron complete issue. However, this is also slower than bucketized inference - buckets = [item.unsqueeze(0) for sublist in buckets for item in sublist] - bucket_prompt_tags = [[item] for sublist in bucket_prompt_tags for item in sublist] - - request = {"tokens": buckets, "prompt_tags": bucket_prompt_tags} - - return positions, request - - def post_process_megatron_generation(self, outputs): - text_outputs = [output[0] for output in outputs] - generated_tokens = self.tokenizer.tokenizer(text_outputs, padding=True, return_tensors="pt").data["input_ids"] - return generated_tokens - - def generate_candidates(self, input_ids, attn_masks, labels): - - tokens_to_generate = self.cfg.tokens_to_generate - if self.cfg.library == "huggingface": - - param_dict = { - "input_ids": input_ids, - "attention_mask": attn_masks, - "max_length": tokens_to_generate, - } - generated_tokens = self.language_model.generate(**param_dict) - - elif self.cfg.library == 'megatron': - reconfigure_num_microbatches_calculator( - rank=0, # This doesn't matter since it is only used for logging - rampup_batch_size=None, - global_batch_size=1, - micro_batch_size=1, # Make sure that there is no "grad acc" while decoding. - data_parallel_size=1, # We check above to make sure that dataparallel size is always 1 at inference. 
- ) - generated_tokens, _ = self.language_model.decode(input_ids, attn_masks, tokens_to_generate) - - generated_field = self.process_into_structured_fields(generated_tokens) - ground_truth_field = self.process_into_structured_fields(labels) - - return generated_field, ground_truth_field - - def process_into_structured_fields(self, full_seq_ids, template_length=None): - - structured_field = [] - for i in range(full_seq_ids.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = full_seq_ids.size(1) - - for j in range(start_point, stop_point): - if full_seq_ids.data[i, j] in [self.tokenizer.tokenizer.pad_token_id, -100] and j != 0: - stop_point = j - break - token_ids = full_seq_ids[i, start_point:stop_point] - one_generated_field = self.tokenizer.tokenizer.decode(token_ids, skip_special_tokens=True).strip() - structured_field.append(one_generated_field) - return structured_field - - def eval_step_helper(self, batch, mode='val'): - - input_ids, attn_masks, labels = batch - - loss = self(input_ids, attn_masks, labels) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - generated_field, ground_truth_field = self.generate_candidates(input_ids, attn_masks, labels) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def prepare_data(self): - """ - Preprocesses schemas and dialogues and caches the result - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == "ms_marco": - self.dialogues_processor = DialogueMSMarcoDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == "sgd_generation": - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - elif self._cfg.dataset.task == "mellon_qa": - self.dialogues_processor = DialogueMellonQADataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - else: - raise ValueError("Only ms_marco, sgd_generation and mellon_qa are supported for the Dialogue S2S Generation Model") - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to the preprocessed dialogues example directory; it will be created if it does not exist.
- """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueS2SGenerationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py deleted file mode 100644 index 1df19cf8a556..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright 2018 The HuggingFace Inc. team. -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from collections import defaultdict -from typing import Dict, List, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_zero_shot_intent_dataset import DialogueZeroShotIntentDataset -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( - ZeroShotIntentInferenceDataset, - calc_class_weights_from_dataloader, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models import TextClassificationModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueZeroShotIntentModel'] - - -class DialogueZeroShotIntentModel(TextClassificationModel): - """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("DialogueZeroShotIntentModel") - - self.cfg = cfg - super().__init__(cfg=cfg, trainer=trainer) - - if self.cfg.library == 'megatron': - # zero shot intent classification loading - # cannot directly load as .nemo uses the pre-refactor model - # therefore transfer its attributes over - if self.cfg.original_nemo_checkpoint is not None: - original_model = DialogueZeroShotIntentModel.restore_from(self.cfg.original_nemo_checkpoint) - self.classifier = original_model.classifier - self.bert_model = original_model.bert_model - self.loss = original_model.loss - self.classification_report = original_model.classification_report - elif self.cfg.library == "huggingface": - self.nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli') - self.bert_model = self.nli_model.model - self.classifier = self.nli_model.classification_head - original_model = DialogueZeroShotIntentModel.restore_from(self.cfg.original_nemo_checkpoint) - self.loss = original_model.loss - self.classification_report = original_model.classification_report - self.tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli') - self.tokenizer.max_seq_length = self.cfg.dataset.max_seq_length - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split) -> 'torch.utils.data.DataLoader': - if self._cfg.dataset.task == "zero_shot": - self.data_processor = DialogueAssistantDataProcessor( - self.cfg.data_dir, self.tokenizer, cfg=self.cfg.dataset - ) - elif self._cfg.dataset.task == "design": - self.data_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'sgd': - self.data_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise 
ValueError("Only zero_shot, design and sgd supported for Zero Shot Intent Model") - - dataset = DialogueZeroShotIntentDataset( - dataset_split, - self.data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 0), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - def forward(self, input_ids, attention_mask, token_type_ids): - if self.cfg.library == 'megatron': - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - logits = self.classifier(hidden_states=hidden_states) - elif self.cfg.library == 'huggingface': - output = self.nli_model(input_ids=input_ids, attention_mask=attention_mask) - logits = output['logits'] - return logits - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config: - logging.info( - f"Dataloader config or file_name for the training set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._train_dl = self._setup_dataloader_from_config(train_data_config, "train") - - # calculate the class weights to be used in the loss function - if self.cfg.dataset.class_balancing == 'weighted_loss': - self.class_weights = calc_class_weights_from_dataloader( - self._train_dl, self.cfg.dataset.num_classes, self.cfg.dataset.data_dir - ) - else: - self.class_weights = None - # we need to create/update the loss module by using the weights calculated from the training data - self.create_loss_module() - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config: - logging.info( - f"Dataloader config or file_path for the validation data set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._validation_dl = self._setup_dataloader_from_config(val_data_config, "dev") - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - if not test_data_config: - logging.info( - f"Dataloader config or file_path for the test data set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._test_dl = self._setup_dataloader_from_config(test_data_config, "test") - - def _setup_infer_dataloader( - self, - queries: List[str], - candidate_labels: List[str], - hypothesis_template=str, - batch_size=1, - max_seq_length: int = -1, - ) -> 'torch.utils.data.DataLoader': - """ - Setup method for inference data loader. Here the premise-hypothesis pairs are made from queries and candidate labels. - - Args: - queries: the queries to classify - candidate_labels: strings to be used as labels - hypothesis_template: the template used to turn each label into an NLI-style hypothesis. Must include a {} - or similar syntax for the candidate label to be inserted. - batch_size: batch size to use during inference - max_seq_length: maximum length of queries, default is -1 for no limit - Returns: - A pytorch DataLoader. 
- """ - dataset = ZeroShotIntentInferenceDataset( - queries=queries, - candidate_labels=candidate_labels, - tokenizer=self.tokenizer, - max_seq_length=max_seq_length, - hypothesis_template=hypothesis_template, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - shuffle=False, - num_workers=2, - pin_memory=False, - drop_last=False, - collate_fn=dataset.collate_fn, - ) - - def validation_step(self, batch, batch_idx, split='val'): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - input_ids, input_type_ids, input_mask, labels = batch - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - val_loss = self.loss(logits=logits, labels=labels) - - preds = torch.argmax(logits, axis=-1) - - tp, fn, fp, _ = self.classification_report(preds, labels) - - loss = { - 'val_loss': val_loss, - 'tp': tp, - 'fn': fn, - 'fp': fp, - 'logits': logits, - 'input_ids': input_ids, - 'labels': labels, - } - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self, split="val"): - """ - Get metrics based on the candidate label with the highest predicted likelihood and the ground truth label for intent - """ - output_logits = torch.cat([output['logits'] for output in self.validation_step_outputs], dim=0) - output_input_ids = torch.cat([output['input_ids'] for output in self.validation_step_outputs], dim=0) - output_labels = torch.cat([output['labels'] for output in self.validation_step_outputs], dim=0) - - if self.cfg.library == 'huggingface': - entail_logits = output_logits[..., 2] - decoded_input_ids = [self.tokenizer.decode(output_input_ids[i]) for i in range(len(output_input_ids))] - utterance_candidate_pairs = [i.split(self.tokenizer.sep_token) for i in decoded_input_ids] - utterances = [ - i[0].replace(self.tokenizer.bos_token, '').replace(self.tokenizer.eos_token, '') - for i in utterance_candidate_pairs - ] - - elif self.cfg.library == 'megatron': - entail_logits = output_logits[..., 1] - decoded_input_ids = [ - self.tokenizer.tokenizer.decode(output_input_ids[i]) for i in range(len(output_input_ids)) - ] - utterance_candidate_pairs = [i.split(self.tokenizer.tokenizer.sep_token) for i in decoded_input_ids] - utterances = [ - i[0].replace(self.tokenizer.tokenizer.bos_token, '').replace(self.tokenizer.tokenizer.eos_token, '') - for i in utterance_candidate_pairs - ] - - # account for uncased tokenization - candidates = [ - i[1] - .replace(self.cfg.dataset.prompt_template.lower(), '') - .replace(self.cfg.dataset.prompt_template, '') - .strip() - for i in utterance_candidate_pairs - ] - utterance_to_idx = defaultdict(list) - for idx, utterance in enumerate(utterances): - utterance_to_idx[utterance].append(idx) - - predicted_labels = [] - ground_truth_labels = [] - utterances = [] - for utterance, idxs in utterance_to_idx.items(): - utterance_candidates = [candidates[idx] for idx in idxs] - logits = [entail_logits[idx].item() for idx in idxs] - labels = [output_labels[idx].item() for idx in idxs] - correct_candidate = utterance_candidates[np.argmax(labels)] - predicted_candidate = utterance_candidates[np.argmax(logits)] - predicted_labels.append(predicted_candidate) - ground_truth_labels.append(correct_candidate) - utterances.append(utterance) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") - - 
DialogueGenerationMetrics.save_predictions( - filename, - predicted_labels, - ground_truth_labels, - utterances, - ) - - label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} - self.classification_report = ClassificationReport( - num_classes=len(label_to_ids), mode='micro', label_ids=label_to_ids, dist_sync_on_step=True - ).to(output_logits[0].device) - predicted_label_ids = torch.tensor([label_to_ids[label] for label in predicted_labels]).to( - output_logits[0].device - ) - ground_truth_label_ids = torch.tensor([label_to_ids[label] for label in ground_truth_labels]).to( - output_logits[0].device - ) - - tp, fn, fp, _ = self.classification_report(predicted_label_ids, ground_truth_label_ids) - precision, recall, f1, report = self.classification_report.compute() - label_acc = np.mean([int(predicted_labels[i] == ground_truth_labels[i]) for i in range(len(predicted_labels))]) - - avg_loss = torch.stack([x[f'val_loss'] for x in self.validation_step_outputs]).mean() - - logging.info(report) - - self.log('unified_precision', precision) - self.log('unified_f1', f1) - self.log('unified_recall', recall) - self.log('unfied_accuracy', label_acc * 100) - self.log('val_loss', avg_loss, prog_bar=True) - - self.validation_step_outputs.clear() # free memory - self.classification_report.reset() - - def predict( - self, - queries: Union[str, List[str]], - candidate_labels: Union[str, List[str]], - hypothesis_template='This example is {}.', - batch_size=1, - multi_label=True, - entailment_idx=1, - contradiction_idx=0, - ) -> List[Dict]: - """ - Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. - - Example usage: - queries = ["I'd like a veggie burger, fries, and a coke", "Turn off the lights in the living room",] - candidate_labels = ["Food order", "Change lighting"] - model.predict(queries, candidate_labels) - - Example output: - [{'sentence': "I'd like a veggie burger, fries, and a coke", - 'labels': ['Food order', 'Change lighting'], - 'scores': [0.8557153344154358, 0.12036784738302231]}, - {'sentence': 'Turn off the lights in the living room', - 'labels': ['Change lighting', 'Food order'], - 'scores': [0.8506497144699097, 0.06594637036323547]}] - - - Args: - queries: the query or list of queries to classify - candidate_labels: string or list of strings to be used as labels - hypothesis_template: the template used to turn each label into an NLI-style hypothesis. Must include a {} - or similar syntax for the candidate label to be inserted. - batch_size: the batch size to use for inference. - multi_label: whether or not multiple candidate labels can be true. If False, the scores are normalized - such that all class probabilities sum to 1. If True, the labels are - considered independent and probabilities are normalized for each candidate by doing a softmax of - the entailment score vs. the contradiction score. - entailment_idx: the index of the "entailment" class in the trained model; models trained on MNLI - using NeMo's glue_benchmark.py or zero_shot_intent_model.py use an index of 1 by default. - contradiction_idx: the index of the "contradiction" class in the trained model; models trained on MNLI - using NeMo's glue_benchmark.py or zero_shot_intent_model.py use an index of 0 by default. - - Returns: - list of dictionaries; one dict per input query. Each dict has keys "sentence", "labels", "scores". 
- labels and scores are parallel lists (with each score corresponding to the label at the same index), - sorted from highest to lowest score. - - """ - if not queries: - raise ValueError("No queries were passed for classification!") - if not candidate_labels: - raise ValueError("No candidate labels were provided!") - - queries = [queries] if isinstance(queries, str) else queries - candidate_labels = [candidate_labels] if isinstance(candidate_labels, str) else candidate_labels - - if len(candidate_labels) == 1: - multi_label = True - - mode = self.training - try: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Switch model to evaluation mode - self.eval() - self.to(device) - - infer_datalayer = self._setup_infer_dataloader( - queries, - candidate_labels, - hypothesis_template=hypothesis_template, - batch_size=batch_size, - max_seq_length=self._cfg.dataset.max_seq_length, - ) - - all_batch_logits = [] - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, _ = batch - - logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - all_batch_logits.append(logits.detach().cpu().numpy()) - - all_logits = np.concatenate(all_batch_logits) - outputs = all_logits.reshape((len(queries), len(candidate_labels), -1)) - - if not multi_label: - # softmax the "entailment" logits over all candidate labels - entail_logits = outputs[..., entailment_idx] - scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True) - else: - # softmax over the entailment vs. contradiction dim for each label independently - entail_contr_logits = outputs[..., [contradiction_idx, entailment_idx]] - scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True) - scores = scores[..., 1] - - result = [] - for i in range(len(queries)): - sorted_idxs = list(reversed(scores[i].argsort())) - result.append( - { - "sentence": queries[i], - "labels": [candidate_labels[j] for j in sorted_idxs], - "scores": scores[i][sorted_idxs].tolist(), - } - ) - - finally: - # set mode back to its original value - self.train(mode=mode) - return result - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - result = [] - result.append( - PretrainedModelInfo( - pretrained_model_name="zeroshotintent_en_bert_base_uncased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/zeroshotintent_en_bert_base_uncased/versions/1.4.1/files/zeroshotintent_en_bert_base_uncased.nemo", - description="DialogueZeroShotIntentModel trained by fine tuning BERT-base-uncased on the MNLI (Multi-Genre Natural Language Inference) dataset, which achieves an accuracy of 84.9% and 84.8% on the matched and mismatched dev sets, respectively.", - ) - ) - result.append( - PretrainedModelInfo( - pretrained_model_name="zeroshotintent_en_megatron_uncased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/zeroshotintent_en_megatron_uncased/versions/1.4.1/files/zeroshotintent_en_megatron_uncased.nemo", - description="DialogueZeroShotIntentModel trained by fine tuning Megatron-BERT-345m=M-uncased on the MNLI (Multi-Genre Natural Language Inference) dataset, which achieves an accuracy of 90.0% and 89.9% on the matched and mismatched dev sets, respectively", - ) - ) - return result diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py deleted file mode 100644 index 09a81b33c973..000000000000 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict, List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_bert_dataset import ( - DialogueBERTDataset, - DialogueIntentSlotInferenceDataset, -) -from nemo.collections.nlp.data.intent_slot_classification import IntentSlotDataDesc -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes import typecheck -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - - -class IntentSlotClassificationModel(NLPModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes BERT Joint Intent and Slot model.""" - # deprecation warning - deprecated_warning("IntentSlotClassificationModel") - - self.max_seq_length = cfg.dataset.max_seq_length - self.cfg = cfg - # Check the presence of data_dir. - if not cfg.dataset.data_dir or not os.path.exists(cfg.dataset.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.dataset.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.dataset.data_dir, cfg.train_ds, cfg.validation_ds) - # init superclass - super().__init__(cfg=cfg, trainer=trainer) - - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_defaults_data_desc(self, cfg): - """ - Method makes sure that cfg.data_desc params are set. - If not, set's them to "dummy" defaults. - """ - if not hasattr(cfg, "data_desc"): - OmegaConf.set_struct(cfg, False) - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = " " - cfg.data_desc.intent_label_ids = {" ": 0} - cfg.data_desc.intent_weights = [1] - # Slots. - cfg.data_desc.slot_labels = " " - cfg.data_desc.slot_label_ids = {" ": 0} - cfg.data_desc.slot_weights = [1] - - cfg.data_desc.pad_label = "O" - OmegaConf.set_struct(cfg, True) - - def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. 
- cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - {'intent_labels_file': 'intent_labels.csv', 'slot_labels_file': 'slot_labels.csv'} - ) - - slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file) - intent_labels_file = os.path.join(data_dir, cfg.class_labels.intent_labels_file) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact('class_labels.intent_labels_file', intent_labels_file) - self.register_artifact('class_labels.slot_labels_file', slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """Saves label ids map to a file""" - with open(filename, 'w') as out: - labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) - out.write('\n'.join(labels)) - logging.info(f'Labels: {label_ids}') - logging.info(f'Labels mapping saved to : {out.name}') - - def _reconfigure_classifier(self): - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.classifier_head.fc_dropout, - num_layers=self.cfg.classifier_head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == 'weighted_loss': - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = CrossEntropyLoss(logits_ndim=2, weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = CrossEntropyLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight] - ) - - # setup to track metrics - self.intent_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - - def update_data_dir_for_training(self, data_dir: str, train_ds, validation_ds) -> None: - """ - Update data directory and get data stats with Data Descriptor. - Also, reconfigures the classifier - to cope with data with e.g. different number of slots. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - # Update configuration with new data. - self._set_data_desc_to_cfg(self.cfg, data_dir, train_ds, validation_ds) - # Reconfigure the classifier for different settings (number of intents, slots etc.). - self._reconfigure_classifier() - - def update_data_dir_for_testing(self, data_dir) -> None: - """ - Update data directory. 
- - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - if self._cfg.tokenizer.get('library', '') == 'megatron': - hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None) - else: - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - intent_logits, slot_logits = self.classifier(hidden_states=hidden_states) - return intent_logits, slot_logits - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - train_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', train_loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': train_loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- """ - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - # calculate accuracy metrics for intents and slot reporting - # intents - intent_preds = torch.argmax(intent_logits, axis=-1) - self.intent_classification_report.update(intent_preds, intent_labels) - # slots - - subtokens_mask = subtokens_mask > 0.5 - slot_preds = torch.argmax(slot_logits, axis=-1) - self.slot_classification_report.update(slot_preds[subtokens_mask], slot_labels[subtokens_mask]) - - loss = { - 'val_loss': val_loss, - 'intent_tp': self.intent_classification_report.tp, - 'intent_fn': self.intent_classification_report.fn, - 'intent_fp': self.intent_classification_report.fp, - 'slot_tp': self.slot_classification_report.tp, - 'slot_fn': self.slot_classification_report.fn, - 'slot_fp': self.slot_classification_report.fp, - 'intent_preds': intent_preds, - 'intent_labels': intent_labels, - 'slot_preds': slot_preds, - 'slot_labels': slot_labels, - 'input': input_ids, - 'subtokens_mask': subtokens_mask, - } - self.validation_step_outputs.append(loss) - return loss - - @staticmethod - def get_continuous_slots(slot_ids, utterance_tokens): - """ - Extract continuous spans of slot_ids - Args: - Slot_ids: list of str representing slot of each word token - For instance, 'O', 'email_address', 'email_address', 'email_address', 'O', 'O', 'O', 'O'] - Corresponds to ['enter', 'atdfd@yahoo', 'dot', 'com', 'into', 'my', 'contact', 'list'] - Returns: - list of str where each element is a slot name-value pair - e.g. ['email_address(atdfd@yahoo dot com)'] - - """ - slot_id_stack = [] - position_stack = [] - for i, slot_id in enumerate(slot_ids): - if not slot_id_stack or slot_id != slot_id_stack[-1]: - slot_id_stack.append(slot_id) - position_stack.append([]) - position_stack[-1].append(i) - - slot_id_to_start_and_exclusive_end = { - slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1] - for i in range(len(position_stack)) - if slot_id_stack[i] != 'O' - } - - slot_to_words = { - slot: ' '.join(utterance_tokens[position[0] : position[1]]) - for slot, position in slot_id_to_start_and_exclusive_end.items() - } - - slot_name_and_values = ["{}({})".format(slot, value) for slot, value in slot_to_words.items()] - - return slot_name_and_values - - def get_utterance_tokens(self, token_ids, token_masks): - """ - Get utterance tokens based on initial utterance tokenization using token_masks, - which shows the starting subtoken of each utterance token. 
- - Args: - token_ids: IntTensor of size (max_seq_len, ) - token_masks: BoolTensor of size (max_seq_len, ) - - Returns - token_list: List of Str (list of tokens with len <= max_seq_len) - """ - tokens_stack = [] - tokens = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids) - - for token_idx, token in enumerate(tokens): - if token_masks[token_idx].item(): - tokens_stack.append([token]) - elif tokens_stack: - clean_token = ( - token.replace("##", '') - .replace(self.tokenizer.tokenizer.sep_token, '') - .replace(self.tokenizer.tokenizer.pad_token, '') - ) - tokens_stack[-1].append(clean_token) - token_list = [''.join(token) for token in tokens_stack] - return token_list - - def get_unified_metrics(self, outputs): - slot_preds = [] - slot_labels = [] - subtokens_mask = [] - inputs = [] - intent_preds = [] - intent_labels = [] - - for output in outputs: - slot_preds += output['slot_preds'] - slot_labels += output["slot_labels"] - subtokens_mask += output["subtokens_mask"] - inputs += output["input"] - intent_preds += output["intent_preds"] - intent_labels += output["intent_labels"] - - ground_truth_labels = self.convert_intent_ids_to_intent_names(intent_labels) - generated_labels = self.convert_intent_ids_to_intent_names(intent_preds) - - predicted_slots = self.mask_unused_subword_slots(slot_preds, subtokens_mask) - ground_truth_slots = self.mask_unused_subword_slots(slot_labels, subtokens_mask) - - all_generated_slots = [] - all_ground_truth_slots = [] - all_utterances = [] - - for i in range(len(predicted_slots)): - utterance_tokens = self.get_utterance_tokens(inputs[i], subtokens_mask[i]) - ground_truth_slot_names = ground_truth_slots[i].split() - predicted_slot_names = predicted_slots[i].split() - processed_ground_truth_slots = IntentSlotClassificationModel.get_continuous_slots( - ground_truth_slot_names, utterance_tokens - ) - processed_predicted_slots = IntentSlotClassificationModel.get_continuous_slots( - predicted_slot_names, utterance_tokens - ) - - all_generated_slots.append(processed_predicted_slots) - all_ground_truth_slots.append(processed_ground_truth_slots) - all_utterances.append(' '.join(utterance_tokens)) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "predictions.jsonl") - - DialogueClassificationMetrics.save_predictions( - filename, - generated_labels, - all_generated_slots, - ground_truth_labels, - all_ground_truth_slots, - ['' for i in range(len(generated_labels))], - ['' for i in range(len(generated_labels))], - all_utterances, - ) - - ( - slot_precision, - slot_recall, - slot_f1, - slot_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(all_generated_slots, all_ground_truth_slots) - - return slot_precision, slot_recall, slot_f1, slot_joint_goal_accuracy - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. 
- """ - - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - ( - unified_slot_precision, - unified_slot_recall, - unified_slot_f1, - unified_slot_joint_goal_accuracy, - ) = self.get_unified_metrics(outputs) - - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - - # calculate metrics and log classification report (separately for intents and slots) - intent_precision, intent_recall, intent_f1, intent_report = self.intent_classification_report.compute() - logging.info(f'Intent report: {intent_report}') - - slot_precision, slot_recall, slot_f1, slot_report = self.slot_classification_report.compute() - logging.info(f'Slot report: {slot_report}') - - self.log(f'{prefix}_loss', avg_loss) - self.log('intent_precision', intent_precision) - self.log('intent_recall', intent_recall) - self.log('intent_f1', intent_f1) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - self.log('unified_slot_precision', unified_slot_precision) - self.log('unified_slot_recall', unified_slot_recall) - self.log('unified_slot_f1', unified_slot_f1) - self.log('unified_slot_joint_goal_accuracy', unified_slot_joint_goal_accuracy) - - self.intent_classification_report.reset() - self.slot_classification_report.reset() - - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - return { - 'val_loss': avg_loss, - 'intent_precision': intent_precision, - 'intent_recall': intent_recall, - 'intent_f1': intent_f1, - 'slot_precision': slot_precision, - 'slot_recall': slot_recall, - 'slot_f1': slot_f1, - 'unified_slot_precision': unified_slot_precision, - 'unified_slot_recall': unified_slot_recall, - 'unified_slot_f1': unified_slot_f1, - 'unified_slot_joint_goal_accuracy': unified_slot_joint_goal_accuracy, - } - - def test_step(self, batch, batch_idx): - """ - Lightning calls this inside the test loop with the data from the test dataloader - passed in as `batch`. - """ - loss = self.validation_step(batch, batch_idx) - self.test_step_outputs.append(loss) - return loss - - def on_test_epoch_end(self): - """ - Called at the end of test to aggregate outputs. - :param outputs: list of individual outputs of each test step. 
- """ - return self.on_validation_epoch_end() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, dataset_split='train') - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, dataset_split='dev') - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, dataset_split='test') - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split: str): - data_processor = DialogueAssistantDataProcessor(self.data_dir, self.tokenizer, cfg=self.cfg.dataset) - - dataset = DialogueBERTDataset( - dataset_split, - data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def _setup_infer_dataloader(self, queries: List[str], test_ds) -> 'torch.utils.data.DataLoader': - """ - Setup function for a infer data loader. - Args: - queries: text - batch_size: batch size to use during inference - Returns: - A pytorch DataLoader. - """ - - dataset = DialogueIntentSlotInferenceDataset( - tokenizer=self.tokenizer, queries=queries, max_seq_length=-1, do_lower_case=False - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=test_ds.batch_size, - shuffle=test_ds.shuffle, - num_workers=test_ds.num_workers, - pin_memory=test_ds.pin_memory, - drop_last=test_ds.drop_last, - ) - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self.cfg.dataset.data_dir = data_dir - self.cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def predict_from_examples(self, queries: List[str], test_ds) -> List[List[str]]: - """ - Get prediction for the queries (intent and slots) - Args: - queries: text sequences - test_ds: Dataset configuration section. - Returns: - predicted_intents, predicted_slots: model intent and slot predictions - """ - - predicted_intents = [] - predicted_slots = [] - mode = self.training - - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents - intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1)) - predicted_intents += self.convert_intent_ids_to_intent_names(intent_preds) - - # predict slots - slot_preds = torch.argmax(slot_logits, axis=-1) - predicted_slots += self.mask_unused_subword_slots(slot_preds, subtokens_mask) - - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots - - def convert_intent_ids_to_intent_names(self, intent_preds): - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - - predicted_intents = [] - - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intent_num in intent_preds: - # if intent_num < len(intent_labels): - predicted_intents.append(intent_labels[int(intent_num)]) - # else: - # # should not happen - # predicted_intents.append("Unknown Intent") - return predicted_intents - - def mask_unused_subword_slots(self, slot_preds, subtokens_mask): - # Retrieve intent and slot vocabularies from configuration. - slot_labels = self.cfg.data_desc.slot_labels - predicted_slots = [] - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - query_slots = '' - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - # if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + ' ' - # else: - # query_slots += 'Unknown_slot ' - predicted_slots.append(query_slots.strip()) - return predicted_slots - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - model = PretrainedModelInfo( - pretrained_model_name="Joint_Intent_Slot_Assistant", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemonlpmodels/versions/1.0.0a5/files/Joint_Intent_Slot_Assistant.nemo", - description="This models is trained on this https://github.com/xliuhw/NLU-Evaluation-Data dataset which includes 64 various intents and 55 slots. Final Intent accuracy is about 87%, Slot accuracy is about 89%.", - ) - result.append(model) - return result diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py deleted file mode 100644 index 6cd2243423a4..000000000000 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
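The slot post-processing in the intent/slot model above groups runs of identical slot labels into word spans and renders them as name(value) pairs. A standalone sketch of that grouping, reusing the example from the removed docstring; the helper name is illustrative and, unlike the original dict-based version, it keeps every span rather than only the last one per label:

def continuous_slots(slot_labels, tokens, empty_label="O"):
    # Group runs of identical labels into [start, exclusive_end) spans.
    spans = []
    for i, label in enumerate(slot_labels):
        if spans and spans[-1][0] == label:
            spans[-1][2] = i + 1
        else:
            spans.append([label, i, i + 1])
    return [
        f"{label}({' '.join(tokens[start:end])})"
        for label, start, end in spans
        if label != empty_label
    ]

tokens = ["enter", "atdfd@yahoo", "dot", "com", "into", "my", "contact", "list"]
labels = ["O", "email_address", "email_address", "email_address", "O", "O", "O", "O"]
print(continuous_slots(labels, tokens))   # ['email_address(atdfd@yahoo dot com)']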
- -''' -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/train_and_predict.py -''' - -import os -from typing import List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader - -from nemo.collections.nlp.data.dialogue import DialogueSGDBERTDataset, DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.sgd.evaluate import evaluate, get_in_domain_services -from nemo.collections.nlp.data.dialogue.sgd.prediction_utils import write_predictions_to_file -from nemo.collections.nlp.losses import SGDDialogueStateLoss -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules import SGDDecoder, SGDEncoder -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['SGDQAModel'] - - -class SGDQAModel(NLPModel): - """ - Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) - - The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. - The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. - The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes - the dialogue state across turns. 
- """ - - @property - def output_module(self): - return self.decoder - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("SGDQAModel") - - self.data_prepared = False - super().__init__(cfg=cfg, trainer=trainer) - self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) - self.decoder = SGDDecoder(embedding_dim=self.bert_model.config.hidden_size) - self.loss = SGDDialogueStateLoss(reduction="mean") - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - token_embeddings = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(token_embeddings, tuple): - token_embeddings = token_embeddings[0] - - encoded_utterance, token_embeddings = self.encoder(hidden_states=token_embeddings) - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self.decoder( - encoded_utterance=encoded_utterance, token_embeddings=token_embeddings, utterance_mask=attention_mask - ) - return ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) - - def training_step(self, batch, batch_idx): - ( - example_id_num, - service_id, - utterance_ids, - token_type_ids, - attention_mask, - intent_status, - requested_slot_status, - categorical_slot_status, - categorical_slot_value_status, - noncategorical_slot_status, - noncategorical_slot_value_start, - noncategorical_slot_value_end, - start_char_idx, - end_char_idx, - task_mask, - ) = batch - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self(input_ids=utterance_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - loss = self.loss( - logit_intent_status=logit_intent_status, - intent_status=intent_status, - logit_req_slot_status=logit_req_slot_status, - requested_slot_status=requested_slot_status, - logit_cat_slot_status=logit_cat_slot_status, - categorical_slot_status=categorical_slot_status, - logit_cat_slot_value_status=logit_cat_slot_value_status, - categorical_slot_value_status=categorical_slot_value_status, - logit_noncat_slot_status=logit_noncat_slot_status, - noncategorical_slot_status=noncategorical_slot_status, - logit_spans=logit_spans, - noncategorical_slot_value_start=noncategorical_slot_value_start, - noncategorical_slot_value_end=noncategorical_slot_value_end, - task_mask=task_mask, - ) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': loss, - 'lr': lr, - } - - def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = 0) -> dict: - """ - Called at every validation step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at validation step - batch_idx: batch index - dataloader_idx: dataloader index - """ - loss, tensors = self.eval_step_helper(batch=batch) - self.log(f'val_loss', loss) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append({f'val_loss': loss, f'tensors': tensors}) - else: - self.validation_step_outputs.append({f'val_loss': loss, f'tensors': tensors}) - - return {f'val_loss': loss, f'tensors': tensors} 
- - def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = 0) -> dict: - """ - Called at every test step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at test step - batch_idx: batch index - dataloader_idx: dataloader index - """ - loss, tensors = self.eval_step_helper(batch=batch) - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append({f'test_loss': loss, f'tensors': tensors}) - else: - self.test_step_outputs.append({f'test_loss': loss, f'tensors': tensors}) - - return {f'test_loss': loss, f'tensors': tensors} - - def eval_step_helper(self, batch: List[torch.Tensor]): - """ - Helper called at every validation/test step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at step - Returns: - loss: averaged batch loss - tensors: collection of aggregated output tensors across all GPU workers - """ - ( - example_id_num, - service_id, - utterance_ids, - token_type_ids, - attention_mask, - intent_status, - requested_slot_status, - categorical_slot_status, - categorical_slot_value_status, - noncategorical_slot_status, - noncategorical_slot_value_start, - noncategorical_slot_value_end, - start_char_idx, - end_char_idx, - task_mask, - ) = batch - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self(input_ids=utterance_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - loss = self.loss( - logit_intent_status=logit_intent_status, - intent_status=intent_status, - logit_req_slot_status=logit_req_slot_status, - requested_slot_status=requested_slot_status, - logit_cat_slot_status=logit_cat_slot_status, - categorical_slot_status=categorical_slot_status, - logit_cat_slot_value_status=logit_cat_slot_value_status, - categorical_slot_value_status=categorical_slot_value_status, - logit_noncat_slot_status=logit_noncat_slot_status, - noncategorical_slot_status=noncategorical_slot_status, - logit_spans=logit_spans, - noncategorical_slot_value_start=noncategorical_slot_value_start, - noncategorical_slot_value_end=noncategorical_slot_value_end, - task_mask=task_mask, - ) - - all_example_id_num = [] - all_service_id = [] - all_logit_intent_status = [] - all_logit_req_slot_status = [] - all_logit_cat_slot_status = [] - all_logit_cat_slot_value_status = [] - all_logit_noncat_slot_status = [] - all_logit_spans = [] - all_start_char_idx = [] - all_end_char_idx = [] - - if self.trainer.num_devices and self.trainer.world_size > 1: - world_size = self.trainer.world_size - for ind in range(world_size): - all_example_id_num.append(torch.empty_like(example_id_num)) - all_service_id.append(torch.empty_like(service_id)) - all_logit_intent_status.append(torch.empty_like(logit_intent_status)) - all_logit_req_slot_status.append(torch.empty_like(logit_req_slot_status)) - all_logit_cat_slot_status.append(torch.empty_like(logit_cat_slot_status)) - all_logit_cat_slot_value_status.append(torch.empty_like(logit_cat_slot_value_status)) - all_logit_noncat_slot_status.append(torch.empty_like(logit_noncat_slot_status)) - all_logit_spans.append(torch.empty_like(logit_spans)) - all_start_char_idx.append(torch.empty_like(start_char_idx)) - all_end_char_idx.append(torch.empty_like(end_char_idx)) - - torch.distributed.all_gather(all_example_id_num, example_id_num) - torch.distributed.all_gather(all_service_id, service_id) - 
torch.distributed.all_gather(all_logit_intent_status, logit_intent_status) - torch.distributed.all_gather(all_logit_req_slot_status, logit_req_slot_status) - torch.distributed.all_gather(all_logit_cat_slot_status, logit_cat_slot_status) - torch.distributed.all_gather(all_logit_cat_slot_value_status, logit_cat_slot_value_status) - torch.distributed.all_gather(all_logit_noncat_slot_status, logit_noncat_slot_status) - torch.distributed.all_gather(all_logit_spans, logit_spans) - torch.distributed.all_gather(all_start_char_idx, start_char_idx) - torch.distributed.all_gather(all_end_char_idx, end_char_idx) - else: - all_example_id_num.append(example_id_num) - all_service_id.append(service_id) - all_logit_intent_status.append(logit_intent_status) - all_logit_req_slot_status.append(logit_req_slot_status) - all_logit_cat_slot_status.append(logit_cat_slot_status) - all_logit_cat_slot_value_status.append(logit_cat_slot_value_status) - all_logit_noncat_slot_status.append(logit_noncat_slot_status) - all_logit_spans.append(logit_spans) - all_start_char_idx.append(start_char_idx) - all_end_char_idx.append(end_char_idx) - - # after this: all_x is list of tensors, of length world_size - example_id_num = torch.cat(all_example_id_num) - service_id = torch.cat(all_service_id) - logit_intent_status = torch.cat(all_logit_intent_status) - logit_req_slot_status = torch.cat(all_logit_req_slot_status) - logit_cat_slot_status = torch.cat(all_logit_cat_slot_status) - logit_cat_slot_value_status = torch.cat(all_logit_cat_slot_value_status) - logit_noncat_slot_status = torch.cat(all_logit_noncat_slot_status) - logit_spans = torch.cat(all_logit_spans) - start_char_idx = torch.cat(all_start_char_idx) - end_char_idx = torch.cat(all_end_char_idx) - - intent_status = torch.nn.Sigmoid()(logit_intent_status) - - # Scores are output for each requested slot. - req_slot_status = torch.nn.Sigmoid()(logit_req_slot_status) - - # For categorical slots, the status of each slot and the predicted value are output. - cat_slot_status_dist = torch.nn.Softmax(dim=-1)(logit_cat_slot_status) - - cat_slot_status = torch.argmax(logit_cat_slot_status, axis=-1) - cat_slot_status_p = torch.max(cat_slot_status_dist, axis=-1)[0] - cat_slot_value_status = torch.nn.Sigmoid()(logit_cat_slot_value_status) - - # For non-categorical slots, the status of each slot and the indices for spans are output. - noncat_slot_status_dist = torch.nn.Softmax(dim=-1)(logit_noncat_slot_status) - - noncat_slot_status = torch.argmax(logit_noncat_slot_status, axis=-1) - noncat_slot_status_p = torch.max(noncat_slot_status_dist, axis=-1)[0] - - softmax = torch.nn.Softmax(dim=1) - - scores = softmax(logit_spans) - start_scores, end_scores = torch.unbind(scores, dim=-1) - - batch_size, max_num_tokens = end_scores.size() - # Find the span with the maximum sum of scores for start and end indices. 
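-            # total_scores[b, i, j] = start_scores[b, i] + end_scores[b, j]; spans with
-            # start > end are zeroed out, the argmax is taken over the flattened
-            # (max_num_tokens x max_num_tokens) grid, and floor_divide / fmod recover
-            # the (start, end) pair from the flat index.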
- total_scores = torch.unsqueeze(start_scores, axis=2) + torch.unsqueeze(end_scores, axis=1) - start_idx = torch.arange(max_num_tokens, device=total_scores.get_device()).view(1, -1, 1) - end_idx = torch.arange(max_num_tokens, device=total_scores.get_device()).view(1, 1, -1) - invalid_index_mask = (start_idx > end_idx).repeat(batch_size, 1, 1) - total_scores = torch.where( - invalid_index_mask, - torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), - total_scores, - ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) - max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] - - span_start_index = torch.floor_divide(max_span_index, max_num_tokens) - span_end_index = torch.fmod(max_span_index, max_num_tokens) - - tensors = { - 'example_id_num': example_id_num, - 'service_id': service_id, - 'intent_status': intent_status, - 'req_slot_status': req_slot_status, - 'cat_slot_status': cat_slot_status, - 'cat_slot_status_p': cat_slot_status_p, - 'cat_slot_value_status': cat_slot_value_status, - 'noncat_slot_status': noncat_slot_status, - 'noncat_slot_status_p': noncat_slot_status_p, - 'noncat_slot_p': max_span_p, - 'noncat_slot_start': span_start_index, - 'noncat_slot_end': span_end_index, - 'noncat_alignment_start': start_char_idx, - 'noncat_alignment_end': end_char_idx, - } - return loss, tensors - - def multi_validation_epoch_end(self, outputs: List[dict], dataloader_idx: int = 0): - """ - Called at the end of validation to post process outputs into human readable format - Args: - outputs: list of individual outputs of each validation step - dataloader_idx: dataloader index - """ - avg_loss = torch.stack([x[f'val_loss'] for x in outputs]).mean() - split = self._validation_names[dataloader_idx][:-1] - dataloader = self._validation_dl[dataloader_idx] - metrics = self.multi_eval_epoch_end_helper(outputs=outputs, split=split, dataloader=dataloader) - - for k, v in metrics.items(): - self.log(f'{split}_{k}', v, rank_zero_only=True) - - self.log(f'val_loss', avg_loss, prog_bar=True, rank_zero_only=True) - - def multi_test_epoch_end(self, outputs: List[dict], dataloader_idx: int = 0): - """ - Called at the end of test to post process outputs into human readable format - Args: - outputs: list of individual outputs of each test step - dataloader_idx: dataloader index - """ - avg_loss = torch.stack([x[f'test_loss'] for x in outputs]).mean() - split = self._test_names[dataloader_idx][:-1] - dataloader = self._test_dl[dataloader_idx] - metrics = self.multi_eval_epoch_end_helper(outputs=outputs, split=split, dataloader=dataloader) - - for k, v in metrics.items(): - self.log(f'{split}_{k}', v, rank_zero_only=True) - - self.log(f'test_loss', avg_loss, prog_bar=True, rank_zero_only=True) - - def multi_eval_epoch_end_helper( - self, outputs: List[dict], split: str, dataloader: torch.utils.data.DataLoader - ) -> dict: - """ - Helper called at the end of evaluation to post process outputs into human readable format - Args: - outputs: list of individual outputs of each step - split: data split - dataloader: dataloader - Returns: - metrics: metrics collection - """ - - def get_str_example_id(split: str, ids_to_service_names_dict: dict, example_id_num: torch.Tensor) -> str: - """ - Constructs string representation of example ID - Args: - split: evaluation data split - ids_to_service_names_dict: id to service name mapping - example_id_num: tensor example id - """ - - def format_turn_id(ex_id_num): - dialog_id_1, dialog_id_2, 
turn_id, service_id, model_task_id, slot_intent_id, value_id = ex_id_num - return "{}-{}_{:05d}-{:02d}-{}-{}-{}-{}".format( - split, - dialog_id_1, - dialog_id_2, - turn_id, - ids_to_service_names_dict[service_id], - model_task_id, - slot_intent_id, - value_id, - ) - - return list(map(format_turn_id, tensor2list(example_id_num))) - - def combine_predictions_in_example(predictions: dict, batch_size: int): - ''' - Combines predicted values to a single example. - Args: - predictions: predictions ordered by keys then batch - batch_size: batch size - Returns: - examples_preds: predictions ordered by batch then key - ''' - examples_preds = [{} for _ in range(batch_size)] - for k, v in predictions.items(): - if k != 'example_id': - v = torch.chunk(v, batch_size) - - for i in range(batch_size): - if k == 'example_id': - examples_preds[i][k] = v[i] - else: - examples_preds[i][k] = v[i].view(-1) - return examples_preds - - example_id_num = torch.cat([x[f'tensors']['example_id_num'] for x in outputs]) - service_id = torch.cat([x[f'tensors']['service_id'] for x in outputs]) - intent_status = torch.cat([x[f'tensors']['intent_status'] for x in outputs]) - req_slot_status = torch.cat([x[f'tensors']['req_slot_status'] for x in outputs]) - cat_slot_status = torch.cat([x[f'tensors']['cat_slot_status'] for x in outputs]) - cat_slot_status_p = torch.cat([x[f'tensors']['cat_slot_status_p'] for x in outputs]) - cat_slot_value_status = torch.cat([x[f'tensors']['cat_slot_value_status'] for x in outputs]) - noncat_slot_status = torch.cat([x[f'tensors']['noncat_slot_status'] for x in outputs]) - noncat_slot_status_p = torch.cat([x[f'tensors']['noncat_slot_status_p'] for x in outputs]) - noncat_slot_p = torch.cat([x[f'tensors']['noncat_slot_p'] for x in outputs]) - noncat_slot_start = torch.cat([x[f'tensors']['noncat_slot_start'] for x in outputs]) - noncat_slot_end = torch.cat([x[f'tensors']['noncat_slot_end'] for x in outputs]) - noncat_alignment_start = torch.cat([x[f'tensors']['noncat_alignment_start'] for x in outputs]) - noncat_alignment_end = torch.cat([x[f'tensors']['noncat_alignment_end'] for x in outputs]) - - ids_to_service_names_dict = self.dialogues_processor.schemas._services_id_to_vocab - example_id = get_str_example_id(dataloader.dataset, ids_to_service_names_dict, example_id_num) - - metrics = {} - try: - prediction_dir = self.trainer.log_dir if self.trainer.log_dir is not None else "" - except: - prediction_dir = "" - - if self.trainer.global_rank == 0: - prediction_dir = os.path.join( - prediction_dir, 'predictions', 'pred_res_{}_{}'.format(split, self._cfg.dataset.task_name) - ) - os.makedirs(prediction_dir, exist_ok=True) - - input_json_files = DialogueSGDDataProcessor.get_dialogue_files( - self._cfg.dataset.data_dir, split, self._cfg.dataset.task_name - ) - - predictions = {} - predictions['example_id'] = example_id - predictions['service_id'] = service_id - predictions['intent_status'] = intent_status - predictions['req_slot_status'] = req_slot_status - predictions['cat_slot_status'] = cat_slot_status - predictions['cat_slot_status_p'] = cat_slot_status_p - predictions['cat_slot_value_status'] = cat_slot_value_status - predictions['noncat_slot_status'] = noncat_slot_status - predictions['noncat_slot_status_p'] = noncat_slot_status_p - predictions['noncat_slot_p'] = noncat_slot_p - predictions['noncat_slot_start'] = noncat_slot_start - predictions['noncat_slot_end'] = noncat_slot_end - predictions['noncat_alignment_start'] = noncat_alignment_start - predictions['noncat_alignment_end'] = 
noncat_alignment_end - - in_domain_services = get_in_domain_services( - os.path.join(self._cfg.dataset.data_dir, split, "schema.json"), - self.dialogues_processor.get_seen_services("train"), - ) - predictions = combine_predictions_in_example(predictions, service_id.shape[0]) - - # write predictions to file in Dstc8/SGD format - write_predictions_to_file( - predictions, - input_json_files, - output_dir=prediction_dir, - schemas=self.dialogues_processor.schemas, - state_tracker=self._cfg.dataset.state_tracker, - eval_debug=False, - in_domain_services=in_domain_services, - ) - metrics = evaluate( - prediction_dir, - self._cfg.dataset.data_dir, - split, - in_domain_services, - joint_acc_across_turn=self._cfg.dataset.joint_acc_across_turn, - use_fuzzy_match=self._cfg.dataset.use_fuzzy_match, - ) - - return metrics - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - # dataset = SGDDataset(dataset_split=split, dialogues_processor=self.dialogues_processor) - - dataset = DialogueSGDBERTDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - schemas=self.dialogues_processor.schemas, - schema_config=self.dialogues_processor.schema_config, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. 
- - Returns: - List of available pre-trained models. - """ - result = [] - - result.append( - PretrainedModelInfo( - pretrained_model_name="sgdqa_bertbasecased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/sgdqa_bertbasecased/versions/1.0.0/files/sgdqa_bertbasecased.nemo", - description="Dialogue State Tracking model finetuned from NeMo BERT Base Cased on Google SGD dataset which has a joint goal accuracy of 59.72% on dev set and 45.85% on test set.", - ) - ) - return result diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py index 17c768705bdd..82f0ee3bbcd1 100644 --- a/nemo/collections/nlp/modules/__init__.py +++ b/nemo/collections/nlp/modules/__init__.py @@ -13,21 +13,17 @@ # limitations under the License. -from nemo.collections.nlp.modules.common import ( - AlbertEncoder, - BertEncoder, - BertModule, - CamembertEncoder, - DistilBertEncoder, - PromptEncoder, - RobertaEncoder, - SequenceClassifier, - SequenceRegression, - SequenceTokenClassifier, - get_lm_model, - get_pretrained_lm_models_list, - get_tokenizer, - get_tokenizer_list, -) -from nemo.collections.nlp.modules.dialogue_state_tracking.sgd_decoder import SGDDecoder -from nemo.collections.nlp.modules.dialogue_state_tracking.sgd_encoder import SGDEncoder +from nemo.collections.nlp.modules.common import AlbertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import BertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import BertModule # noqa: F401 +from nemo.collections.nlp.modules.common import CamembertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import DistilBertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import PromptEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import RobertaEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceClassifier # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceRegression # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceTokenClassifier # noqa: F401 +from nemo.collections.nlp.modules.common import get_lm_model # noqa: F401 +from nemo.collections.nlp.modules.common import get_pretrained_lm_models_list # noqa: F401 +from nemo.collections.nlp.modules.common import get_tokenizer # noqa: F401 +from nemo.collections.nlp.modules.common import get_tokenizer_list # noqa: F401 diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py b/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py deleted file mode 100644 index 9e3250071955..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
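Editor's note on the evaluation post-processing in sgdqa_model.py above: combine_predictions_in_example regroups a dict of batched tensors into one dict per example before the SGD/DSTC8-format predictions are written out. A standalone sketch of that reshaping is shown below (it omits the special-cased string 'example_id' key handled in the original):

    import torch

    def combine_predictions_in_example(predictions: dict, batch_size: int):
        examples = [{} for _ in range(batch_size)]
        for key, value in predictions.items():
            chunks = torch.chunk(value, batch_size)    # split along the batch dimension
            for i in range(batch_size):
                examples[i][key] = chunks[i].view(-1)  # flatten each example's slice
        return examples

    preds = {"intent_status": torch.rand(4, 3), "req_slot_status": torch.rand(4, 5)}
    per_example = combine_predictions_in_example(preds, batch_size=4)
    print(per_example[0]["intent_status"].shape)  # torch.Size([3])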
diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py b/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py deleted file mode 100644 index 2ffe5330183e..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import torch -from torch import nn as nn - -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import LogitsType, NeuralType - -__all__ = ['SGDDecoder'] - - -class LogitsQA(nn.Module): - def __init__(self, num_classes: int, embedding_dim: int): - """Get logits for elements by conditioning on input embedding. - Args: - num_classes: An int containing the number of classes for which logits are to be generated. - embedding_dim: hidden size of the BERT - - Returns: - A tensor of shape (batch_size, num_classes) containing the logits. - """ - super().__init__() - self.num_classes = num_classes - self.utterance_proj = nn.Linear(embedding_dim, embedding_dim) - self.activation = nn.functional.gelu - - self.layer1 = nn.Linear(embedding_dim, num_classes) - - def forward(self, encoded_utterance): - """ - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - """ - - # Project the utterance embeddings. - utterance_embedding = self.utterance_proj(encoded_utterance) - utterance_embedding = self.activation(utterance_embedding) - - logits = self.layer1(utterance_embedding) - return logits - - -class SGDDecoder(NeuralModule): - """ - SGDDecoder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - "logit_intent_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_req_slot_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_cat_slot_status": NeuralType(('B', 'T'), LogitsType()), - "logit_cat_slot_value_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_noncat_slot_status": NeuralType(('B', 'T'), LogitsType()), - "logit_spans": NeuralType(('B', 'T', 'D'), LogitsType()), - } - - def __init__(self, embedding_dim: int) -> None: - - """Get logits for elements by conditioning on utterance embedding. - - Args: - embedding_dim: hidden size of the BERT - """ - super().__init__() - - projection_module = LogitsQA - - self.intent_layer = projection_module(1, embedding_dim) - self.requested_slots_layer = projection_module(1, embedding_dim) - - self.cat_slot_value_layer = projection_module(1, embedding_dim) - - # Slot status values: none, dontcare, active. 
- self.slot_status_layer = projection_module(3, embedding_dim) - - # dim 2 for non_categorical slot - to represent start and end position - self.noncat_layer1 = nn.Linear(embedding_dim, embedding_dim) - self.noncat_activation = nn.functional.gelu - self.noncat_layer2 = nn.Linear(embedding_dim, 2) - - @typecheck() - def forward(self, encoded_utterance, token_embeddings, utterance_mask): - """ - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - token_embeddings: token embeddings from BERT encoding of the utterance - utterance_mask: utterance mask wiht 0 for padding - """ - _, _ = encoded_utterance.size() - logit_intent_status = self._get_intents(encoded_utterance) - - logit_req_slot_status = self._get_requested_slots(encoded_utterance) - - logit_cat_slot_status, logit_cat_slot_value_status = self._get_categorical_slot_goals(encoded_utterance) - - (logit_noncat_slot_status, logit_spans) = self._get_noncategorical_slot_goals( - encoded_utterance=encoded_utterance, utterance_mask=utterance_mask, token_embeddings=token_embeddings - ) - - return ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) - - def _get_intents(self, encoded_utterance): - """Obtain logits for intents. - Args: - encoded_utterance: representation of utterance - """ - logits = self.intent_layer(encoded_utterance=encoded_utterance,) - return logits - - def _get_requested_slots(self, encoded_utterance): - """Obtain logits for requested slots. - Args: - encoded_utterance: representation of utterance - """ - - logits = self.requested_slots_layer(encoded_utterance=encoded_utterance) - return logits - - def _get_categorical_slot_goals(self, encoded_utterance): - """ - Obtain logits for status and values for categorical slots - Slot status values: none, dontcare, active - Args: - encoded_utterance: representation of utterance - """ - - # Predict the status of all categorical slots. - status_logits = self.slot_status_layer(encoded_utterance=encoded_utterance) - - value_status_logits = self.cat_slot_value_layer(encoded_utterance=encoded_utterance) - return status_logits, value_status_logits - - def _get_noncategorical_slot_goals(self, encoded_utterance, utterance_mask, token_embeddings): - """ - Obtain logits for status and slot spans for non-categorical slots. - Slot status values: none, dontcare, active - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - utterance_mask: utterance mask wiht 0 for padding - token_embeddings: token embeddings from BERT encoding of the utterance - """ - status_logits = self.slot_status_layer(encoded_utterance=encoded_utterance) - - # Project the combined embeddings to obtain logits, Shape: (batch_size, max_num_slots, max_num_tokens, 2) - span_logits = self.noncat_layer1(token_embeddings) - span_logits = self.noncat_activation(span_logits) - span_logits = self.noncat_layer2(span_logits) - - # Mask out invalid logits for padded tokens. - utterance_mask = utterance_mask.to(bool) # Shape: (batch_size, max_num_tokens). 
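-        # Broadcast the (batch, max_num_tokens, 1) mask over the two span channels and
-        # replace padded positions with a large negative constant so they cannot win
-        # the downstream softmax/argmax over candidate start and end indices.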
- repeated_utterance_mask = utterance_mask.unsqueeze(-1) - negative_logits = (torch.finfo(span_logits.dtype).max * -0.7) * torch.ones( - span_logits.size(), device=span_logits.get_device(), dtype=span_logits.dtype - ) - - span_logits = torch.where(repeated_utterance_mask, span_logits, negative_logits) - - return status_logits, span_logits - - def _get_negative_logits(self, logits): - """Returns tensor with negative logits that will be used to mask out unused values for a particular service - Args: - logits: logits whose shape and type will be used to create negative tensor - """ - negative_logits = (torch.finfo(logits.dtype).max * -0.7) * torch.ones( - logits.size(), dtype=logits.dtype, device=logits.get_device() - ) - return negative_logits diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py b/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py deleted file mode 100644 index 948a806ad37c..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from torch import nn as nn - -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import ChannelType, LogitsType, NeuralType - -__all__ = ['SGDEncoder'] - -ACT2FN = {"tanh": nn.functional.tanh, "relu": nn.functional.relu} - - -class SGDEncoder(Classifier): - """ - Neural module which encodes BERT hidden states - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - - return { - "logits": NeuralType(('B', 'T'), LogitsType()), - 'hidden_states': NeuralType(('B', 'T', 'C'), ChannelType()), - } - - def __init__( - self, hidden_size: int, activation: str = 'tanh', dropout: float = 0.0, use_transformer_init: bool = True, - ) -> None: - - """ - Args: - hidden_size: hidden size of the BERT model - activation: activation function applied - dropout: dropout ratio - use_transformer_init: use transformer initialization - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - self.fc = nn.Linear(hidden_size, hidden_size) - - if activation not in ACT2FN: - raise ValueError(f'{activation} is not in supported ' + '{ACTIVATIONS_F.keys()}') - - self.activation = ACT2FN[activation] - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - """ - Args: - hidden_states: bert output hidden states - """ - first_token_hidden_states = hidden_states[:, 0] - logits = self.fc(first_token_hidden_states) - logits = self.activation(logits) - logits = self.dropout1(logits) - return logits, self.dropout2(hidden_states)
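Editor's note on sgd_encoder.py above: SGDEncoder is essentially a [CLS]-pooling head; it takes the first token's hidden state, projects it through a linear layer, applies the activation, and returns the dropped-out pooled vector alongside the dropped-out token embeddings. A minimal standalone sketch of that pattern follows; the class name and defaults are illustrative, not NeMo's.

    import torch
    from torch import nn

    class ClsPooler(nn.Module):
        def __init__(self, hidden_size: int, dropout: float = 0.0):
            super().__init__()
            self.fc = nn.Linear(hidden_size, hidden_size)
            self.dropout1 = nn.Dropout(dropout)
            self.dropout2 = nn.Dropout(dropout)

        def forward(self, hidden_states: torch.Tensor):
            # hidden_states: (batch, seq_len, hidden_size) from a BERT-style encoder
            pooled = torch.tanh(self.fc(hidden_states[:, 0]))
            return self.dropout1(pooled), self.dropout2(hidden_states)

    pooler = ClsPooler(hidden_size=768, dropout=0.1)
    utterance_repr, token_repr = pooler(torch.randn(2, 16, 768))
    print(utterance_repr.shape, token_repr.shape)  # torch.Size([2, 768]) torch.Size([2, 16, 768])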