diff --git a/nemo/collections/nlp/data/dialogue/__init__.py b/nemo/collections/nlp/data/dialogue/__init__.py deleted file mode 100644 index a3992ef59971..000000000000 --- a/nemo/collections/nlp/data/dialogue/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset import ( - DialogueBERTDataset, - DialogueGPTClassificationDataset, - DialogueSGDBERTDataset, - DialogueZeroShotIntentDataset, -) -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/nemo/collections/nlp/data/dialogue/data_processor/__init__.py b/nemo/collections/nlp/data/dialogue/data_processor/__init__.py deleted file mode 100644 index 2db92b257416..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py deleted file mode 100644 index 92c56a4c20df..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueAssistantDataProcessor'] - - -class DialogueAssistantDataProcessor(DialogueDataProcessor): - """Data Processor for Assistant dialogues.""" - - def __init__(self, data_dir: str, tokenizer: object, cfg): - """ - Constructs DialogueAssistantDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - """ - # deprecation warning - deprecated_warning("DialogueAssistantDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - self.intents = self.open_file("dict.intents.csv") - if self.cfg.preprocess_intent_function == 'remove_domain': - self.intents = [ - DialogueAssistantDataProcessor.normalize_zero_shot_intent(intent) for intent in self.intents - ] - self.slots = self.open_file("dict.slots.csv") - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(self.slots) - self.slots = unified_slots - - self.bio_slot_ids_to_unified_slot_ids = bio_slot_ids_to_unified_slot_ids - self.services = sorted(list(set([intent.split('_')[0] for intent in self.intents]))) - self.empty_slot_id = [str(idx) for idx, slot_name in enumerate(self.slots) if slot_name == "O"][0] - - @staticmethod - def normalize_zero_shot_intent(label): - label = label.split('.')[1] - if label == 'nomatch': - return 'no match' - else: - return label.replace('_', ' ') - - def open_file(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - lines = [i.strip() for i in f.readlines()] - return lines - - @staticmethod - def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ - Extract continuous spans of slot_ids - - To accomodate slots with distinct labels for B-label1 and I-label1, - slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - - Args: - Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 - Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" - Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, - each containing a start position and an exclusive end position - E.g {18: [9, 10], 44: [11, 13], 46: [14, 16], 12: [17, 18]} - """ - slot_id_stack = [] - position_stack = [] - for i in range(len(slot_ids)): - slot_id = slot_ids[i] - - slot_id = bio_slot_ids_to_unified_slot_ids[slot_id] - - if not slot_id_stack or slot_id != slot_id_stack[-1]: - slot_id_stack.append(slot_id) - position_stack.append([]) - position_stack[-1].append(i) - - slot_id_to_start_and_exclusive_end = { - slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1] - for i in range(len(position_stack)) - if slot_id_stack[i] != empty_slot_id - } - - return slot_id_to_start_and_exclusive_end - - @staticmethod - def map_bio_format_slots_to_unified_slots(slots): - """ - maps BIO format slots to unified slots (meaning that B-alarm_time and I-alarm_time both map to alarm_time) - called even slots does not contain BIO, for unified interface - in that case slots == unified_slots and 
bio_slot_ids_to_unified_slot_ids is an identity mapping i.e. {"0": "0", "1": "1"} - """ - bio_slot_ids_to_unified_slot_ids = {} - unified_slots = [] - unified_idx = -1 - for idx, slot in enumerate(slots): - if slot.replace('I-', '').replace('B-', '') not in unified_slots: - unified_idx += 1 - unified_slots.append(slot.replace('I-', '').replace('B-', '')) - bio_slot_ids_to_unified_slot_ids[str(idx)] = str(unified_idx) - return bio_slot_ids_to_unified_slot_ids, unified_slots - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. - This is done by taking every 10th example and putting it into the dev set, - with all other examples going into the new train set. - """ - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "test"} - - raw_examples_intent = self.open_file("{}.tsv".format(dataset_split_print[dataset_split])) - # removes header of tsv file - raw_examples_intent = raw_examples_intent[1:] - raw_examples_slots = self.open_file("{}_slots.tsv".format(dataset_split_print[dataset_split])) - - if dataset_split in ["train", "dev"]: - train_idx = [] - dev_idx = [] - for idx in range(len(raw_examples_intent)): - if idx % 10 == 0: - dev_idx.append(idx) - else: - train_idx.append(idx) - - if dataset_split == "train": - raw_examples_intent = [raw_examples_intent[idx] for idx in train_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in train_idx] - elif dataset_split == "dev": - raw_examples_intent = [raw_examples_intent[idx] for idx in dev_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in dev_idx] - - for i in range(len(raw_examples_intent)): - utterance, intent_id = raw_examples_intent[i].split('\t') - slot_ids = raw_examples_slots[i].split() - utterance_tokens = utterance.split() - intent = self.intents[int(intent_id)] - slot_id_to_start_and_exclusive_end = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, self.empty_slot_id, self.bio_slot_ids_to_unified_slot_ids - ) - - slot_to_start_and_exclusive_end = { - self.slots[int(slot_id)]: position for slot_id, position in slot_id_to_start_and_exclusive_end.items() - } - slot_to_words = { - slot: ' '.join(utterance_tokens[position[0] : position[1]]) - for slot, position in slot_to_start_and_exclusive_end.items() - } - input_example = { - "utterance": utterance, - "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, - "label_positions": { - "slots": { - slot: { - "start": position[0], - "exclusive_end": position[1], - "slot": slot, - } - for slot, position in slot_to_start_and_exclusive_end.items() - } - }, - "possible_labels": { - "service": self.services, - "intent": self.intents, - "slots": { - # this dataset does not support categorical slots (i.e. 
only extractive slots) - # therefore use empty list for all values - slot: [] - for slot in self.slots - }, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py deleted file mode 100644 index c41c1f5e04ca..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import random - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueDataProcessor'] - - -class DialogueDataProcessor(DataProcessor): - """ - Base class for Data Processing for all data sources - - Data Processor is designed to be Model-independent (but Data-dependent) so that - - Encourages experimentation with a variety of models \ - (BERT-style; GPT-style; T5-style), \ - which have different tokenization/preprocessing requirements - - Facilitates experiments with a variety of data sources, - as data is processed into a common format - - Roles - 1. Processes raw files into Dialogue Input Examples. - 2. 
Keeps all possibly relevant information from the raw files, which - the Dataset class can then determine which labels to use - - """ - - def __init__(self): - # deprecation warning - deprecated_warning("DialogueDataProcessor") - - raise NotImplementedError() - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - raise NotImplementedError() - - @staticmethod - def get_relevant_idxs(dataset_split, n_samples, dev_proportion): - """ - Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: - dataset_split: train, dev or test - n_samples: total number of samples - dev_proportion: value from 1 to 99 that represent proportion of data in dev set - Returns: - idxs: indices for relevant samples - """ - - if dataset_split in ["train", "dev"]: - n_dev = int(n_samples * (dev_proportion / 100)) - dev_idxs = random.sample(list(range(n_samples)), n_dev) - if dataset_split == "dev": - idxs = dev_idxs - else: - dev_idxs_set = set(dev_idxs) - train_idxs = [idx for idx in list(range(n_samples)) if idx not in dev_idxs_set] - idxs = train_idxs - - elif dataset_split == "test": - idxs = list(range(n_samples)) - - else: - raise ValueError("please select dataset split from train, dev and test") - - return idxs diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py deleted file mode 100644 index 56e99c4bcfe9..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
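# Illustration only: a minimal, self-contained sketch of the span-extraction behaviour that
# DialogueAssistantDataProcessor.get_continuous_slots documents above. It is not part of the
# NeMo code in this diff; the slot ids, the empty-slot id and the BIO-to-unified mapping below
# are made-up example values.
def extract_continuous_slot_spans(slot_ids, empty_slot_id, bio_to_unified):
    """Collapse a per-token slot-id sequence into {slot_id: [start, exclusive_end]} spans."""
    slot_id_stack, position_stack = [], []
    for i, slot_id in enumerate(slot_ids):
        slot_id = bio_to_unified[slot_id]  # B-label and I-label ids collapse to one unified id
        if not slot_id_stack or slot_id != slot_id_stack[-1]:
            slot_id_stack.append(slot_id)
            position_stack.append([])
        position_stack[-1].append(i)
    return {
        slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1]
        for i in range(len(position_stack))
        if slot_id_stack[i] != empty_slot_id
    }

# Tokens 2-3 carry slot "1" (tagged with the B-/I- ids "1" and "2"); everything else is "O" (id "0").
bio_to_unified = {"0": "0", "1": "1", "2": "1"}
print(extract_continuous_slot_spans(["0", "0", "1", "2", "0", "0"], "0", bio_to_unified))
# -> {'1': [2, 4]}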
- -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueDesignDataProcessor'] - - -class DialogueDesignDataProcessor(DialogueDataProcessor): - """Data Processor for Design Dataset""" - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueDesignDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueDesignDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_design_OV.csv') - # remove disabled examples - raw_examples = [raw_examples[i] for i in range(len(raw_examples)) if raw_examples[i]['disabled'] != 'yes'] - - n_samples = len(raw_examples) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - all_intents = sorted(list(set(raw_examples[i]['intent labels'] for i in range(len(raw_examples))))) - all_services = sorted(list(set(raw_examples[i]['domain'] for i in range(len(raw_examples))))) - for i in idxs: - raw_example = raw_examples[i] - utterances = [raw_example['example_{}'.format(i)] for i in range(1, 4)] - service = raw_example['domain'] - intent = raw_example['intent'] - intent_description = raw_example['intent labels'] - system_utterance = raw_example['response'] - - slot_names = [raw_example['slot{}'.format(i)] for i in range(1, 3)] - # these are possible slot values not ground truth slot values - slot_values = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - slot_questions = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - - for j in range(1, 3): - value = raw_example['slot{}'.format(j)] - if isinstance(value, str): - system_utterance = system_utterance.replace('slot{}'.format(j), value) - - valid_slots_ids = [i for i, slot in enumerate(slot_names) if isinstance(slot, str)] - slot_names = [slot_names[i] for i in valid_slots_ids] - slot_values = [slot_values[i] if isinstance(slot_values[i], str) else '' for i in valid_slots_ids] - slot_questions = [slot_questions[i] if isinstance(slot_questions[i], str) else '' for i in valid_slots_ids] - - for utterance in utterances: - if not isinstance(utterance, str): - continue - input_example = { - "utterance": utterance, - "system_utterance": system_utterance, - "labels": { - "service": service, - "intent": intent_description, - "slots": { - slot: '' for slot in slot_names - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': all_intents, - "service": all_services, - "slots": {slot: slot_values[i] for i, slot 
in enumerate(slot_names)}, - }, - "description": { - "service": service, - "intent": intent_description, - "slots": {slot: slot_questions[i] for i, slot in enumerate(slot_names)}, - }, - } - - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py deleted file mode 100644 index 67d58ff5d21e..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueMellonQADataProcessor'] - - -class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues.""" - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueMellonQADataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. 
- Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_qa_data.csv') - raw_examples = list(raw_examples.values()) - # filter out answers with no answer - raw_examples = [ - example - for example in raw_examples - if isinstance(example['Non Generative Question Answering '], str) - and isinstance(example['Generative Question Answering '], str) - ] - - n_samples = len(raw_examples) - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - for i in idxs: - utterance = str(raw_examples[i]['Question']) - answer = str(raw_examples[i]['Non Generative Question Answering ']) - well_formed_answer = str(raw_examples[i]['Generative Question Answering ']) - passage = raw_examples[i]['Passage'] - input_example = { - "utterance": utterance, - "example_id": i, - "labels": { - "response": answer, - "fluent_response": well_formed_answer, - "passage": passage, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py deleted file mode 100644 index d09960a35d69..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from ast import literal_eval - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueMSMarcoDataProcessor'] - - -class DialogueMSMarcoDataProcessor(DialogueDataProcessor): - """Data Processor for MS Marco dialogues. 
(https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz - """ - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - debug_mode: reduce number of samples to load in order to increase speed of processing - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueMSMarcoDataProcessor") - - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_json(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - data = json.load(f) - return data - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - """ - - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "dev"} - - raw_examples = self.open_json("{}_v2.1.json".format(dataset_split_print[dataset_split])) - - n_samples = len(raw_examples['answers']) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - if self.cfg.debug_mode: - idxs = idxs[:100] - - for i in idxs: - utterance = raw_examples['query'][str(i)] - # answer need not be extracted from passage - # taking the first answer as the ground truth correct answer as only <1% has multiple answers - answer = raw_examples['answers'][str(i)] - answer = answer[0] if isinstance(answer, list) else answer - - well_formed_answer = raw_examples['wellFormedAnswers'][str(i)] - well_formed_answer = ( - well_formed_answer if isinstance(well_formed_answer, list) else literal_eval(well_formed_answer) - ) - well_formed_answer = well_formed_answer[0] if well_formed_answer else None - query_type = raw_examples['query_type'][str(i)] - candidate_passages = raw_examples['passages'][str(i)] - passage = [ - candidate_passage["passage_text"] - for candidate_passage in candidate_passages - if int(candidate_passage["is_selected"]) - ] - passage = passage[0] if passage else None - - possible_passages = [candidate_passage["passage_text"] for candidate_passage in candidate_passages] - - input_example = { - "utterance": utterance, - "example_id": i, - "labels": { - "service": query_type, - "response": answer, - "fluent_response": well_formed_answer, - "passage": passage, - }, - "possible_labels": { - "service": "LOCATION,NUMERIC,PERSON,DESCRIPTION,ENTITY".split(','), - "passage": possible_passages, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return 
self.get_dialog_examples("test") diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py deleted file mode 100644 index 1d37c26f1c45..000000000000 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ /dev/null @@ -1,578 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" -import collections -import json -import os -import pickle -import re -from typing import List - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = ['DialogueSGDDataProcessor'] - -FILE_RANGES = { - "sgd_single_domain": {"train": range(1, 44), "dev": range(1, 8), "test": range(1, 12)}, - "sgd_multi_domain": {"train": range(44, 128), "dev": range(8, 21), "test": range(12, 35)}, - "sgd_all": {"train": range(1, 128), "dev": range(1, 21), "test": range(1, 35)}, - "sgd_all_single": {"train": range(1, 128), "dev": range(1, 8), "test": range(1, 12)}, - "multiwoz": {"train": range(1, 18), "dev": range(1, 3), "test": range(1, 3)}, - "debug_sample": {"train": range(1, 2), "dev": range(1, 2), "test": range(1, 2)}, -} - - -class DialogueSGDDataProcessor(DialogueDataProcessor): - """Data Processor for SGD dialogues. - - More information at https://arxiv.org/abs/1909.05855 - - ***Downloading the dataset*** - # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git - - ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. - - In the following we will show an example for a service entry in the schema file. - * service_name - * description - * slots - * name - * description - * is_categorical - * possible values - * intents - * name - * description - * required_slots (not used) - * is_transactional (not used) - * optional_slots (not used) - * result_slots (not used) - - - In the following we will show an example for a dialogue. 
- * dialogue_id - * services - * turns - * frames - * actions - * act - * slot - * values - * service - * slots - * exclusive_end - * slot - * start - * state - * active_intent - * requeste_slots - * slot_values - * speaker - [USER, SYSTEM] - * utterance - - """ - - def __init__( - self, - data_dir: str, - dialogues_example_dir: str, - tokenizer: object, - cfg=None, - ): - """ - Constructs DialogueSGDDataProcessor - Args: - data_dir: path to data directory - dialogues_example_dir: path to store processed dialogue examples - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueSGDDataProcessor") - - self.data_dir = data_dir - self.cfg = cfg - - self._task_name = self.cfg.task_name # e.g. "sgd_single_domain" - self._subsample = self.cfg.subsample - - all_schema_json_paths = [] - for dataset_split in ['train', 'test', 'dev']: - all_schema_json_paths.append(os.path.join(self.cfg.data_dir, dataset_split, "schema.json")) - self.schemas = Schema(all_schema_json_paths) - - self.schema_config = { - "MAX_NUM_CAT_SLOT": self.cfg.max_num_cat_slot, - "MAX_NUM_NONCAT_SLOT": self.cfg.max_num_noncat_slot, - "MAX_NUM_VALUE_PER_CAT_SLOT": self.cfg.max_value_per_cat_slot, - "MAX_NUM_INTENT": self.cfg.max_num_intent, - "NUM_TASKS": self.cfg.num_tasks, - "MAX_SEQ_LENGTH": self.cfg.max_seq_length, - } - - train_file_range = FILE_RANGES[self._task_name]["train"] - dev_file_range = FILE_RANGES[self._task_name]["dev"] - test_file_range = FILE_RANGES[self._task_name]["test"] - - self._file_ranges = { - "train": train_file_range, - "dev": dev_file_range, - "test": test_file_range, - } - - self._seen_services = { - "train": set(), - "dev": set(), - "test": set(), - } - - self._tokenizer = tokenizer - - self._dialogues_example_dir = dialogues_example_dir - - self.dial_files = {} - - # slots_relation_list.np would contain the candidate list of slots for each (service, slot) which would be - # looked into when a switch between two services happens in the dialogue and we can not find any value for a slot in the current user utterance. - # This file would get generated from the dialogues in the training set. - self.slots_relation_file = os.path.join( - dialogues_example_dir, f"{self._task_name}_train_slots_relation_list.np" - ) - for dataset in ["train", "dev", "test"]: - # Process dialogue files - dial_file = f"{self._task_name}_{dataset}_examples.json" - dial_file = os.path.join(dialogues_example_dir, dial_file) - self.dial_files[(self._task_name, dataset)] = dial_file - - dialog_paths = DialogueSGDDataProcessor.get_dialogue_files(data_dir, dataset, self._task_name) - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - for dialog in dialogs: - self._seen_services[dataset].update(set(dialog['services'])) - - if is_global_rank_zero(): - overwrite_dial_files = not self.cfg.use_cache - self.save_dialog_examples(overwrite_dial_files=overwrite_dial_files) - - def save_dialog_examples(self, overwrite_dial_files: bool): - """ - Preprocesses dialogues and saves to disk. 
- Args: - overwrite_dial_files: whether or not to overwrite saved file if already exists - """ - for dataset in ["train", "dev", "test"]: - dial_file = self.dial_files[(self._task_name, dataset)] - if not os.path.exists(dial_file) or overwrite_dial_files: - logging.info(f"Start generating the dialogue examples for {dataset} dataset.") - if not os.path.exists(self._dialogues_example_dir): - os.makedirs(self._dialogues_example_dir) - dial_examples, slots_relation_list = self._generate_dialog_examples( - dataset, self.schemas, self._subsample - ) - - with open(dial_file, "w", encoding="UTF-8") as f: - json.dump([i.data for i in dial_examples], f) - - if dataset == "train": - with open(self.slots_relation_file, "wb") as f: - pickle.dump(slots_relation_list, f) - logging.info(f"The slot carry-over list for train set is stored at {self.slots_relation_file}") - - logging.info(f"The dialogue examples for {dataset} dataset saved at {dial_file}") - logging.info(f"Finish generating the dialogue examples for {dataset} dataset.") - - # common interface for Data Processor - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - def get_dialog_examples(self, dataset_split: str) -> List[object]: - """ - Loads preprocessed dialogue examples from disk. - Args: - dataset_split: dataset split - Returns: - dial_examples: list of InputExample's. - """ - if (self._task_name, dataset_split) not in self.dial_files or not os.path.exists( - self.dial_files[(self._task_name, dataset_split)] - ): - raise ValueError( - f"{dataset_split} dialogue examples were not processed for {self._task_name} task. Re-initialize SGDDataProcessor and add {dataset_split} dataset split to datasets arg." - ) - dial_file = self.dial_files[(self._task_name, dataset_split)] - logging.info(f"Loading dialogue examples from {dial_file}.") - - with open(dial_file, "rb") as f: - dial_examples = json.load(f) - dial_examples = [DialogueInputExample(i) for i in dial_examples] - if not os.path.exists(self.slots_relation_file): - raise ValueError( - f"Slots relation file {self.slots_relation_file} does not exist. It is needed for the carry-over mechanism of state tracker for switches between services." - ) - if os.path.getsize(self.slots_relation_file) > 0: - with open(self.slots_relation_file, "rb") as f: - self.schemas._slots_relation_list = pickle.load(f) - logging.info( - f"Loaded the slot relation list for value carry-over between services from {self.slots_relation_file}." - ) - - return dial_examples - - def get_seen_services(self, dataset_split: str): - """ - Returns list of seen services, i.e. both in given and training split - Args: - dataset_split: data split - Returns: - seen_services: list of seen services - """ - seen_services = self._seen_services[dataset_split] - return seen_services - - def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsample: bool): - """ - Returns a list of `InputExample`s of the data splits' dialogues. - Args: - dataset_split: data split, can be "train", "dev", or "test". 
- schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - logging.info(f'Creating examples and slot relation list from the dialogues started...') - dialog_paths = [ - os.path.join(self.data_dir, dataset_split, "dialogues_{:03d}.json".format(i)) - for i in self._file_ranges[dataset_split] - ] - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - - examples = [] - slot_carryover_candlist = collections.defaultdict(int) - for dialog_idx, dialog in enumerate(dialogs): - if dialog_idx % 1000 == 0: - logging.info(f'Processed {dialog_idx} dialogues.') - examples.extend( - self._create_examples_from_dialog(dialog, schemas, dataset_split, slot_carryover_candlist, subsample) - ) - - slots_relation_list = collections.defaultdict(list) - for slots_relation, relation_size in slot_carryover_candlist.items(): - if relation_size > 0: - slots_relation_list[(slots_relation[0], slots_relation[1])].append( - (slots_relation[2], slots_relation[3], relation_size) - ) - slots_relation_list[(slots_relation[2], slots_relation[3])].append( - (slots_relation[0], slots_relation[1], relation_size) - ) - - return examples, slots_relation_list - - def _create_examples_from_dialog( - self, dialog: dict, schemas: object, dataset_split: str, slot_carryover_candlist: dict, subsample: bool - ): - """ - Create examples for every turn in the dialogue. - Args: - dialog: dialogue example - schemas: schema for all services of all datasets - dataset_split: data split - slot_carryover_candlist: a dictionary to keep and count the number of carry-over cases between two slots from two different services - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - dialog_id = dialog["dialogue_id"] - prev_states = {} - examples = [] - for turn_idx, turn in enumerate(dialog["turns"]): - # Generate an example for every frame in every user turn. 
- if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - user_frames = {f["service"]: f for f in turn["frames"]} - if self.cfg.system_utterance == 'prev_turn': - if turn_idx > 0: - system_turn = dialog["turns"][turn_idx - 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - else: - system_utterance = "" - system_frames = {} - else: # takes the system utterance of the next turn - system_turn = dialog["turns"][turn_idx + 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - - turn_id = "{}-{}-{:02d}".format(dataset_split, dialog_id, turn_idx) - turn_examples, prev_states, slot_carryover_values = self._create_examples_from_turn( - turn_id, - system_utterance, - user_utterance, - system_frames, - user_frames, - prev_states, - schemas, - subsample, - ) - examples.extend(turn_examples) - - for value, slots_list in slot_carryover_values.items(): - if value in ["True", "False"]: - continue - if len(slots_list) > 1: - for service1, slot1 in slots_list: - for service2, slot2 in slots_list: - if service1 == service2: - continue - if service1 > service2: - service1, service2 = service2, service1 - slot1, slot2 = slot2, slot1 - slot_carryover_candlist[(service1, slot1, service2, slot2)] += 1 - return examples - - def _get_state_update(self, current_state: dict, prev_state: dict) -> dict: - """ - Updates dialogue state - Args: - current_state: slot values pairs for the current dialogue turn - prev_state: slot values pairs for the previous dialogue turns - Returns: - state_update: slot values pairs that are added/updated during the current dialogue turn - """ - state_update = dict(current_state) - for slot, values in current_state.items(): - if slot in prev_state and prev_state[slot][0] in values: - # Remove the slot from state if its value didn't change. - state_update.pop(slot) - return state_update - - @staticmethod - def convert_camelcase_to_lower(label): - """Converts camelcase to lowercase with spaces e.g. 'HelloWorld' --> 'hello world'""" - if label.lower() == "none": - return "none" - label = label.split("_")[0] - tokens = re.findall('[A-Z][^A-Z]*', label) - return ' '.join([token.lower() for token in tokens]) - - def preprocess_intent(self, intent, schemas, service): - if self.cfg.preprocess_intent_function == 'default': - return intent - elif self.cfg.preprocess_intent_function == 'lowercase': - return DialogueSGDDataProcessor.convert_camelcase_to_lower(intent) - elif self.cfg.preprocess_intent_function == 'description': - return schemas.get_service_schema(service).intent_descriptions[intent] - else: - raise ValueError( - 'Only default, lowercase and description are allowed for model.dataset.preprocess_intent_function for SGD task' - ) - - def _create_examples_from_turn( - self, - turn_id: int, - system_utterance: str, - user_utterance: str, - system_frames: dict, - user_frames: dict, - prev_states: dict, - schemas: object, - subsample: bool, - ): - """ - Creates an example for each frame in the user turn. 
- Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. {'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - system_user_utterance = system_utterance + ' ' + user_utterance - states = {} - - examples = [] - slot_carryover_values = collections.defaultdict(list) - - for service, user_frame in user_frames.items(): - - state = user_frame["state"]["slot_values"] - state_update = self._get_state_update(state, prev_states.get(service, {})) - states[service] = state - system_frame = system_frames.get(service, None) - dataset_split, dialog_id, turn_id_ = turn_id.split('-') - dialog_id_1, dialog_id_2 = dialog_id.split('_') - example_id = f"{turn_id}-{service}" - example_id_num = [ - int(dialog_id_1), - int(dialog_id_2), - int(turn_id_), - schemas.get_service_id(service), - ] - intent = user_frames[service]["state"]['active_intent'] - all_possible_slots = schemas.get_service_schema(service).slots - categorical_slots = schemas.get_service_schema(service).categorical_slots - one_example = { - "example_id": example_id, - "example_id_num": example_id_num, - "utterance": user_utterance, - "system_utterance": system_utterance, - "system_slots": ( - {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None - ), - "system_actions": system_frame["actions"] if system_frame is not None else None, - "labels": { - "service": service, - "intent": self.preprocess_intent(intent, schemas, service), - "slots": {slot: state[slot] for slot in state_update}, - }, - "label_positions": {"slots": {slot["slot"]: slot for slot in user_frames[service]["slots"]}}, - "possible_labels": { - "service": schemas.services, - "intent": [ - self.preprocess_intent(intent, schemas, service) - for intent in schemas.get_service_schema(service).intents - ], - "slots": { - slot: ( - schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] - ) - for slot in all_possible_slots - }, - }, - "description": { - "service": schemas.get_service_schema(service).description, - "intent": schemas.get_service_schema(service).intent_descriptions[intent], - "slots": { - slot: schemas.get_service_schema(service).slot_descriptions[slot] for slot in state_update - }, - }, - } - - examples.append(DialogueInputExample(one_example)) - - if service not in prev_states and int(turn_id_) > 0: - for slot_name, values in state_update.items(): - for value in values: - slot_carryover_values[value].append((service, slot_name)) - for prev_service, prev_slot_value_list in prev_states.items(): - if prev_service == service: - continue - if prev_service in state: - prev_slot_value_list = state[prev_service] - for prev_slot_name, prev_values in prev_slot_value_list.items(): - for prev_value in prev_values: - slot_carryover_values[prev_value].append((prev_service, prev_slot_name)) - - return examples, states, slot_carryover_values - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - 
Find indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for slot_span in char_slot_spans: - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries - - @classmethod - def load_dialogues(cls, dialog_json_filepaths: List[str]) -> List[dict]: - """ - Obtain the list of all dialogues from specified json files. - Args: - dialog_json_filepaths: list of json files - Returns: - dialogs: the list of all dialogues - """ - dialogs = [] - for dialog_json_filepath in sorted(dialog_json_filepaths): - with open(dialog_json_filepath, 'r', encoding="UTF-8") as f: - dialogs.extend(json.load(f)) - f.close() - return dialogs - - @classmethod - def get_dialogue_files(cls, data_dir: str, dataset_split: str, task_name: str): - """ - Obtain the list of all dialogue json files - Args: - data_dir: path to the data folder - dataset_split: data split - task_name: SGD task name, see keys of the FILE_RANGES - Returns: - dialog: the list of all dialogue json files paths - """ - return [ - os.path.join(data_dir, dataset_split, 'dialogues_{:03d}.json'.format(fid)) - for fid in FILE_RANGES[task_name][dataset_split] - ] diff --git a/nemo/collections/nlp/data/dialogue/dataset/__init__.py b/nemo/collections/nlp/data/dialogue/dataset/__init__.py deleted file mode 100644 index 3352c7be2d9b..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
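# Illustration only: a self-contained sketch of the per-turn state-update rule implemented by
# DialogueSGDDataProcessor._get_state_update above (a slot is dropped from the update when its
# first previous value is still present in the current state). Not part of the NeMo code in this
# diff; the slot values below are example data.
def state_update(current_state, prev_state):
    update = dict(current_state)
    for slot, values in current_state.items():
        if slot in prev_state and prev_state[slot][0] in values:
            update.pop(slot)  # value unchanged since the last turn, so it is not a new update
    return update

prev_turn_state = {"city": ["San Jose"]}
current_turn_state = {"city": ["San Jose"], "cuisine": ["American"]}
print(state_update(current_turn_state, prev_turn_state))
# -> {'cuisine': ['American']}: only the newly introduced slot survives in the update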
- -from nemo.collections.nlp.data.dialogue.dataset.dialogue_bert_dataset import DialogueBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_zero_shot_intent_dataset import DialogueZeroShotIntentDataset diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py deleted file mode 100644 index 33d46c308e81..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] - - -class DialogueBERTDataset(DialogueDataset): - """ - Creates a dataset to use for the task of joint intent - and slot classification with pretrained model. - - For a dataset to use during inference without labels, see - IntentSlotDataset. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer - cfg: config container for dataset - """ - # deprecation warning - deprecated_warning("DialogueBERTDataset") - - self.cfg = cfg - self.all_possible_labels = dialogues_processor.intents - self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} - self.all_possible_slots = dialogues_processor.slots - self.slot_name_to_slot_id = {self.all_possible_slots[i]: i for i in range(len(self.all_possible_slots))} - self.empty_slot_name = 'O' - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.features if self.cfg.num_samples == -1 else self.features[: self.cfg.num_samples] - - queries = [feature.data["utterance"] for feature in self.features] - if self.cfg.do_lowercase: - queries = [query.lower() for query in queries] - intents = [self.label_to_label_id[feature.data["labels"]["intent"]] for feature in self.features] - word_level_slots = [self.convert_slot_position_to_slot_ids(feature.data) for feature in self.features] - - features = DialogueBERTDataset.get_features( - queries, - self.cfg.max_seq_length, - tokenizer, - pad_label=self.cfg.pad_label, - word_level_slots=word_level_slots, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = intents - - def convert_slot_position_to_slot_ids(self, feature): - slot_ids = [self.slot_name_to_slot_id[self.empty_slot_name] for i in range(len(feature["utterance"].split()))] - slot_name_to_positions = feature["label_positions"]["slots"] - - for slot_name in slot_name_to_positions: - slot_id = self.slot_name_to_slot_id[slot_name] - start = slot_name_to_positions[slot_name]["start"] - exclusive_end = slot_name_to_positions[slot_name]["exclusive_end"] - for to_replace_position in range(start, min(exclusive_end, len(slot_ids))): - slot_ids[to_replace_position] = slot_id - - return slot_ids - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - @staticmethod - def truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ): - - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if 
len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - return ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - @staticmethod - def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - word_level_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, - ): - """ - Convert queries (utterance, intent label and slot labels) to BERT input format - """ - - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = word_level_slots is not None - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - # mask all sub-word tokens except the first token in a word - # use the label for the first sub-word token as the label for the entire word to eliminate need for disambiguation - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([word_level_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - - # truncate and pad samples - ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) = DialogueBERTDataset.truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - 
all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - # log examples for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - # deprecation warning - deprecated_warning("DialogueIntentSlotInferenceDataset") - - if do_lower_case: - queries = [query.lower() for query in queries] - - features = DialogueBERTDataset.get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py deleted file mode 100644 index 5540dd3b19f7..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
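
Reviewer note: the DialogueBERTDataset.get_features logic removed above expands word-level slot labels onto sub-word tokens and computes loss only on the first piece of each word. A minimal, self-contained sketch of that pattern follows; `toy_tokenize` and the example slot ids are illustrative stand-ins for the real tokenizer and label vocabulary, not the removed implementation itself.

```python
# Sketch of the word-to-subword expansion used by the removed
# DialogueBERTDataset.get_features. toy_tokenize stands in for
# tokenizer.text_to_tokens and is purely illustrative.
from typing import List, Tuple


def toy_tokenize(word: str) -> List[str]:
    # Pretend every word longer than 4 characters splits into two word pieces.
    return [word] if len(word) <= 4 else [word[:4], "##" + word[4:]]


def build_features(
    words: List[str],
    word_slots: List[int],
    pad_label: int = 128,
    ignore_start_end: bool = False,
    ignore_extra_tokens: bool = False,
) -> Tuple[List[str], List[int], List[int], List[int]]:
    subtokens = ["[CLS]"]
    loss_mask = [1 - ignore_start_end]   # optionally skip loss on [CLS]/[SEP]
    subtokens_mask = [0]                 # 1 marks the first piece of each word
    slots = [pad_label]
    for word, slot in zip(words, word_slots):
        pieces = toy_tokenize(word) or ["[UNK]"]
        subtokens.extend(pieces)
        # loss is always computed on the first piece; trailing pieces are optional
        loss_mask.append(1)
        loss_mask.extend([int(not ignore_extra_tokens)] * (len(pieces) - 1))
        subtokens_mask.append(1)
        subtokens_mask.extend([0] * (len(pieces) - 1))
        # every piece of a word inherits the word-level slot label
        slots.extend([slot] * len(pieces))
    subtokens.append("[SEP]")
    loss_mask.append(1 - ignore_start_end)
    subtokens_mask.append(0)
    slots.append(pad_label)
    return subtokens, loss_mask, subtokens_mask, slots


if __name__ == "__main__":
    print(build_features(["wake", "me", "tomorrow"], [54, 54, 18]))
```
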
- -from nemo.core.classes import Dataset - -__all__ = ['DialogueDataset'] - - -class DialogueDataset(Dataset): - ''' - Base class for Dialogue Datasets - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. intent classification, slot filling or sequence generation etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, **kwargs): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError - - def __getitem__(self, idx: int): - raise NotImplementedError diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py deleted file mode 100644 index f89a5013c2ae..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import random -from collections import defaultdict - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - - -class DialogueGPTClassificationDataset(DialogueDataset): - ''' - Designed for classification tasks such as intent/domain classification as well as slot tagging - - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueGPTClassificationDataset") - - self.cfg = cfg - - if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": - raise ValueError( - "slot-filling is not supported by eval_mode {}, please set model.dataset.eval_mode=generation instead".format( - self.cfg.eval_mode - ) - ) - if self.cfg.target_template != "with_slots" and self.cfg.field == "slots": - raise ValueError("please set model.dataset.target_template='with_slots' if model.dataset.field='slots'") - self.label_type = self.cfg.field - if self.cfg.target_template == "with_description": - self.label_to_description = defaultdict(str) - self.all_possible_labels = set() - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - self.max_candidates = 2 - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - self.features = dialogues_processor.get_dialog_examples(dataset_split) - for idx in range(len(self.features)): - self.preprocess_feature(idx) - if self.cfg.debug_mode: - self.features = self.features[:16] - # for few shot learning to append in the prompt - self.lm_features = self.get_lm_samples() - - def transform(self, label): - """ - Normalize labels by replacing underscore with space - - Args: - label: str - Returns: - normalized_label: str - """ - if self.cfg.task == "assistant" and self.cfg.prompt_template != "prompt_tuning": - label = label.replace('_', ' ') - return label - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def preprocess_feature(self, idx): - ex = self.features[idx].data - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - if self.label_type in ["service", "intent"]: - label = self.transform(label) - candidates = [self.transform(candidate) for candidate in candidates] - - self.features[idx].data["labels"][self.label_type] = label - self.features[idx].data["possible_labels"][self.label_type] = candidates - if self.cfg.target_template == "with_description": - description = ex["description"][self.label_type] - self.label_to_description[label] = description - for candidate in candidates: - self.all_possible_labels.add(candidate) - self.max_candidates = max(self.max_candidates, len(candidates)) - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - @staticmethod - def linearize_slots(slots): - """ - Serialize slots into a linear text - - Args: - slots: dict with each slot_name as key and possible slot values as value - Returns: - linear_slots: text based representation of slot names and values - """ - if not slots: - return 
"None" - return ", ".join( - ["{}({})".format(slot, value if isinstance(value, str) else value[0]) for slot, value in slots.items()] - ) - - def format_target(self, target, slots=None): - """ - Formats the back part of the training example, after the base_template - for instance, "restaurant" in " service: restaurant" - or "set alarm\nslots: (), ()" in \ - "\nintent: set alarm\nslots: (), ()" - """ - if self.cfg.target_template == "with_description": - return target + ' (' + self.label_to_description[target] + ')' - elif self.cfg.target_template == "default": - return target - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "intent": - return target + '\nslots: ' + DialogueGPTClassificationDataset.linearize_slots(slots) - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "slots": - return DialogueGPTClassificationDataset.linearize_slots(slots) - else: - raise ValueError("Please choose a target format from {default, with_description, with_slots}") - - def get_lm_samples(self): - max_sample_length = 0 - lm_features = [] - for idx in range(len(self.features)): - ex = self.features[idx].data - utterance = ex["utterance"] - label = ex["labels"][self.label_type] - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - lm_feature = self.format_prompt(utterance) + ' ' + self.format_target(label, slots=slots) - feature_len = self.get_n_tokens_in_sentence(lm_feature) - max_sample_length = max(max_sample_length, feature_len) - lm_features.append(lm_feature) - logging.info("max feature length per sample with label: ".format(max_sample_length)) - logging.info( - "please adjust max seq len to at least {} * ({} + 1) = {} but not too much more for efficiency".format( - max_sample_length, self.cfg.few_shot, max_sample_length * (1 + self.cfg.few_shot) - ) - ) - return lm_features - - def format_prompt(self, utterance, few_shot=0, idx=None): - if self.cfg.prompt_template == "default": - base_template = utterance + ' ' + self.label_type + ':' - elif self.cfg.prompt_template == "i_want_to": - base_template = utterance + ' ' + 'I want to' - elif self.cfg.prompt_template == "prompt_tuning": - base_template = utterance + '\n' + self.label_type + ':' - elif self.cfg.prompt_template == "prompt_tuning_with_options": - base_template = ( - 'possible intents: ' - + ', '.join(sorted(list(self.all_possible_labels))) - + '\n\n' - + utterance - + '\n' - + self.label_type - + ':' - ) - - if few_shot > 0: - few_shot_indices = random.sample(range(len(self.features)), few_shot + 1) - few_shot_indices = [i for i in few_shot_indices if i != idx][:few_shot] - few_shot_samples = [self.lm_features[i] for i in few_shot_indices] - base_template = ( - self.tokenizer.tokenizer.pad_token.join(few_shot_samples) - + self.tokenizer.tokenizer.pad_token - + base_template - ) - return base_template - - def collate_fn(self, batch): - """ - Truncates elements to max length in batch - """ - _, _, _, _, candidate_attn_masks, _, _, _ = zip(*batch) - # determine max length in batch - batch_max_length = 0 - for candidate_attn_mask in candidate_attn_masks: - for one_attn_mask in candidate_attn_mask: - batch_max_length = max(batch_max_length, torch.sum(one_attn_mask).item()) - # padding for tp=2 situation - if batch_max_length % 2: - batch_max_length += 1 - - all_items = [] - for item in zip(*batch): - if isinstance(item[0], int): - item = [torch.tensor(i) for i in item] - item_stack = torch.stack(item) - # if item_stack is 1d, elements 
refers to indexes and there is no need to truncate - if len(item_stack.size()) == 1: - all_items.append(item_stack) - # otherwise, truncate last dimension to max length in batch - else: - all_items.append(item_stack[..., :batch_max_length]) - return all_items - - def __getitem__(self, idx: int): - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. service: restaurant - e.g. service: restaurant - e.g. \nintent: set alarm\nslots: (), () - - Generation example: - e.g. service: - - ''' - ex = self.features[idx].data - - utterance = ex["utterance"] - utterance_length = self.get_n_tokens_in_sentence(utterance) - - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - - base_template = self.format_prompt(utterance, few_shot=self.cfg.few_shot, idx=idx) - - sentence_without_answer = base_template - - sentence = base_template + ' ' + self.format_target(label, slots=slots) - - if self.cfg.eval_mode == "binary_score": - candidate_sentences = [] - for candidate in candidates: - positive_answer = base_template + ' ' + candidate + ' Answer: ' + 'yes' - negative_answer = base_template + ' ' + candidate + ' Answer: ' + 'no' - if candidate == label: - correct_candidate = len(candidate_sentences) // 2 - candidate_sentences.append(positive_answer) - candidate_sentences.append(negative_answer) - else: - candidate_sentences.append(negative_answer) - candidate_sentences.append(positive_answer) - else: - correct_candidate = 0 - candidate_sentences = [ - base_template + ' ' + self.format_target(candidate, slots=slots) for candidate in candidates - ] - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - candidate_tokenized_sentences = [ - self.default_encode(candidate_sentence) for candidate_sentence in candidate_sentences - ] - - # ensure all samples have the same number of candidates for collating into tensor - while len(candidate_tokenized_sentences) < self.max_candidates: - candidate_tokenized_sentences.append(candidate_tokenized_sentences[0]) - - candidate_input_ids = torch.stack([i[1] for i in candidate_tokenized_sentences]) - candidate_attn_masks = torch.stack([i[2] for i in candidate_tokenized_sentences]) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - training_mask_end, - utterance_length, - correct_candidate, - ) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py deleted file mode 100644 index 8ddbc2e3925e..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - - -class DialogueGPTGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueGPTGenerationDataset") - - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - def remove_invalid_samples(self, features): - valid_idxs = [] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - for i in range(len(features)): - features[i].data["labels"]["utterance"] = features[i].data["utterance"] - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field] or not features[i].data["labels"][field].strip(): - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - ex["labels"]["utterance"] = ex["utterance"] - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - ''' - For each example, this function determines the format of input and output sequences based on user-specified conguration. 
- This is controlled by model.dataset.input_field and model.dataset.output_field - For instance: - If model.dataset.input_field == response and model.dataset.output_field == fluent_response: - Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) - If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) - If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) - ''' - ex = self.features[idx].data - - input_sentence = self.format_prompt(ex) - - utterance_length = self.get_n_tokens_in_sentence(input_sentence) - - output_sentence = ex["labels"][self.output_label_type] - - base_template = input_sentence - - sentence_without_answer = base_template + ' ' + self.output_label_type + ':' - - sentence = sentence_without_answer + ' ' + output_sentence - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return (input_ids, attn_masks, labels, training_mask_end, utterance_length) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py deleted file mode 100644 index dc123ca0e3d7..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueNearestNeighbourDataset'] - - -class DialogueNearestNeighbourDataset(DialogueDataset): - """ - Dataset for training a Nearest Neighbour model for zero shot intent recognition. 
- """ - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - """ - # deprecation warning - deprecated_warning("DialogueNearestNeighbourDataset") - - self.cfg = cfg - self.tokenizer = tokenizer - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.max_n = self.find_max_n_candidates() - self.examples = self._create_examples(self.raw_features) - - def find_max_n_candidates(self): - max_n = 0 - for idx in range(len(self.raw_features)): - ex = self.raw_features[idx].data - n = len(ex["possible_labels"]["intent"]) - max_n = max(max_n, n) - return max_n - - def _create_examples(self, raw_features): - """Creates examples for the training and dev sets.""" - examples = [] - seen_utterances = set() - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - if user_utterance in seen_utterances: - continue - seen_utterances.add(user_utterance) - intent = ex["labels"]["intent"] - sentences = [user_utterance] - labels = [-1] - for candidate_intent in ex["possible_labels"]["intent"]: - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - labels.append(label) - sentences.append(text_b) - - while self.max_n > len(labels) - 1: - labels.append(label) - sentences.append(text_b) - - encoded_input = self.tokenizer.tokenizer( - sentences, - padding='max_length', - truncation=True, - return_tensors='pt', - max_length=self.cfg.max_seq_length, - ) - examples.append((encoded_input['input_ids'], encoded_input['attention_mask'], torch.tensor(labels))) - return examples - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - return self.examples[idx] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py deleted file mode 100644 index df522b74e861..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils.decorators import deprecated_warning - - -class DialogueS2SGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - # deprecation warning - deprecated_warning("DialogueS2SGenerationDataset") - - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - @staticmethod - def format_actions(prompt_template, actions): - """ - Formats actions based on prompt_template - - Args: - prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions - actions: list of actions, each a dict containing keys 'act', 'slot' and 'values' with their corresponding values as their attribute-values - - Returns: - formatted_actions: string representations of actions, formatted based on the fields needed. - """ - actions_str = [] - for action in actions: - act = action['act'].lower() - slot = action['slot'] - value = action['values'][0] if action['values'] else '' - - if prompt_template == 'values': - action_str = value - elif prompt_template == 'slots_values': - if value: - action_str = '{} ({})'.format(slot, value) - else: - action_str = slot - elif prompt_template == 'acts_slots_values': - if value: - action_str = '{} {} ({})'.format(act, slot, value) - elif slot: - action_str = '{} {}'.format(act, slot) - else: - action_str = act - else: - raise ValueError( - "Please set model.dataset.prompt_template to acts_slots_values, slots_values or values" - ) - actions_str.append(action_str) - return ' '.join(actions_str) - - def remove_invalid_samples(self, features): - valid_idxs = [] - for i in range(len(features)): - for field in ['utterance', 'system_utterance', 'system_actions']: - if field in features[i].data: - features[i].data["labels"][field] = features[i].data[field] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field]: - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def 
format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response - e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response - e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response - ''' - ex = self.features[idx].data - for field in ['utterance', 'system_utterance']: - if field in ex: - ex["labels"][field] = ex[field] - - if 'system_actions' in ex: - ex["labels"]['system_actions'] = DialogueS2SGenerationDataset.format_actions( - self.cfg.prompt_template, ex['system_actions'] - ) - - input_sentence = self.format_prompt(ex) - output_sentence = ex["labels"][self.output_label_type] - - _, input_ids, attn_masks = self.default_encode(input_sentence) - - _, labels, _ = self.default_encode(output_sentence) - - labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 - - return input_ids, attn_masks, labels diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py deleted file mode 100644 index fcab5e91329f..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst -""" - -import os -import re -from typing import List - -import numpy as np - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import SGDInputExample - -__all__ = ['DialogueSGDBERTDataset'] - - -class DialogueSGDBERTDataset(DialogueDataset): - ''' - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, schemas, schema_config, cfg): - """ Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - schemas: SGD schema for domain, intent and slots - schema_config: config dict for schemas - cfg: cfg container for dataset - """ - self.dataset_split = dataset_split - self.tokenizer = tokenizer - self.schemas = schemas - self.schema_config = schema_config - self.dialogues_processor = dialogues_processor - self.cfg = cfg - self.subsample = self.dialogues_processor._subsample - - dial_file = f"{dialogues_processor._task_name}_{dataset_split}_examples_bert.processed" - self.dial_file = os.path.join(self.cfg.data_dir, dial_file) - if self.cfg.use_cache and os.path.exists(self.dial_file): - self.load_features() - else: - self.process_features() - self.save_features() - - def load_features(self): - with open(self.dial_file, "rb") as f: - self.features = np.load(f, allow_pickle=True) - - def process_features(self): - self.features = [] - self.raw_features = self.dialogues_processor.get_dialog_examples(self.dataset_split) - for idx in range(len(self.raw_features)): - self.bert_process_one_sample(idx) - - def save_features(self): - with open(self.dial_file, "wb") as f: - np.save(f, self.features) - - def _tokenize(self, utterance: str): - """ - Tokenize the utterance - - Args: - utterance: A string containing the utterance to be tokenized. - - Returns: - bert_tokens: A list of tokens obtained by word-piece tokenization of the - utterance. - alignments: A dict mapping indices of characters corresponding to start - and end positions of words (not subwords) to corresponding indices in - bert_tokens list. - inverse_alignments: A list of size equal to bert_tokens. Each element is a - tuple containing the index of the starting and inclusive ending - character of the word corresponding to the subword. This list is used - during inference to map word-piece indices to spans in the original - utterance. - """ - # utterance = tokenization.convert_to_unicode(utterance) - - # After _naive_tokenize, spaces and punctuation marks are all retained, i.e. - # direct concatenation of all the tokens in the sequence will be the - # original string. - tokens = DialogueSGDBERTDataset._naive_tokenize(utterance) - # ['I', ' ', 'am', ' ', 'feeling', ' ', 'hungry', ' ', 'so', ' ', 'I', ' ', 'would', ' ', 'like', ' ', 'to', ' ', 'find', ' ', 'a', ' ', 'place', ' ', 'to', ' ', 'eat', '.'] - # Filter out empty tokens and obtain aligned character index for each token. - alignments = {} - char_index = 0 - bert_tokens = ( - [] - ) # ['I', 'am', 'feeling', 'hungry', 'so', 'I', 'would', 'like', 'to', 'find', 'a', 'place', 'to', 'eat', '.'] - # These lists store inverse alignments to be used during inference. - bert_tokens_start_chars = [] - bert_tokens_end_chars = [] - for token in tokens: - if token.strip(): - subwords = self.tokenizer.text_to_tokens(token) - # Store the alignment for the index of starting character and the - # inclusive ending character of the token. - alignments[char_index] = len(bert_tokens) - bert_tokens_start_chars.extend([char_index] * len(subwords)) - bert_tokens.extend(subwords) - # The inclusive ending character index corresponding to the word. 
- inclusive_char_end = char_index + len(token) - 1 - alignments[inclusive_char_end] = len(bert_tokens) - 1 - bert_tokens_end_chars.extend([inclusive_char_end] * len(subwords)) - char_index += len(token) - inverse_alignments = list(zip(bert_tokens_start_chars, bert_tokens_end_chars)) - return bert_tokens, alignments, inverse_alignments - - @classmethod - def _naive_tokenize(cls, s: str): - """ - Tokenizes a string, separating words, spaces and punctuations. - Args: - s: a string - Returns: - seq_tok: list of words, spaces and punctuations from the string - """ - # Spaces and punctuation marks are all retained, i.e. direct concatenation - # of all the tokens in the sequence will be the original string. - seq_tok = [tok for tok in re.split(r"([^a-zA-Z0-9])", s) if tok] - return seq_tok - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx: int): - ex = self.features[idx] - - return ( - np.array(ex.example_id_num), - np.array(ex.example_id_num[-1]), # service_id - np.array(ex.utterance_ids), - np.array(ex.utterance_segment), - np.array(ex.utterance_mask, dtype=np.longlong), - np.array(ex.intent_status, dtype=np.float32), - np.array(ex.requested_slot_status, dtype=np.float32), - np.array(ex.categorical_slot_status), - np.array(ex.categorical_slot_value_status, dtype=np.float32), - np.array(ex.noncategorical_slot_status), - np.array(ex.noncategorical_slot_value_start), - np.array(ex.noncategorical_slot_value_end), - np.array(ex.start_char_idx), # noncat_alignment_start - np.array(ex.end_char_idx), # noncat_alignment_end - np.array(ex.task_mask), # noncat_alignment_end - ) - - def bert_process_one_sample(self, idx): - """ - Creates an example for each frame in the user turn. - Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. 
{'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - - ex = self.raw_features[idx].data - example_id_num = ex["example_id_num"] - example_id = ex["example_id"] - user_utterance = ex["utterance"] - system_utterance = ex["system_utterance"] - service = ex["labels"]["service"] - schemas = self.schemas - state_update = ex["labels"]["slots"] - system_slots = ex["system_slots"] - - user_tokens, user_alignments, user_inv_alignments = self._tokenize(user_utterance) - system_tokens, system_alignments, system_inv_alignments = self._tokenize(system_utterance) - system_user_utterance = system_utterance + ' ' + user_utterance - system_user_tokens, system_user_alignments, system_user_inv_alignments = self._tokenize(system_user_utterance) - examples = [] - - base_example = SGDInputExample(schema_config=self.schema_config, tokenizer=self.tokenizer) - base_example.service_schema = self.schemas.get_service_schema(service) - base_example.service_id = example_id_num[-1] - - base_example.example_id = example_id - base_example.example_id_num = example_id_num - - for model_task in range(self.schema_config["NUM_TASKS"]): - if model_task == 0: - for intent_id, intent in enumerate(schemas.get_service_schema(service).intents): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.intent_id = intent_id - task_example.example_id += f"-{model_task}-{intent_id}-0" - task_example.example_id_num.extend([model_task, intent_id, 0]) - intent_description = ( - intent + " " + self.schemas.get_service_schema(service).intent_descriptions[intent] - ) - intent_tokens, intent_alignments, intent_inv_alignments = self._tokenize(intent_description) - task_example.add_utterance_features( - intent_tokens, - intent_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - intent_description, - system_user_utterance, - ) - - task_example.add_intents(ex) - examples.append(task_example) - - if model_task == 1: - for slot_id, slot in enumerate(schemas.get_service_schema(service).slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.requested_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + self.schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - user_tokens, - user_inv_alignments, - slot_description, - user_utterance, - ) - - task_example.add_requested_slots(ex) - examples.append(task_example) - - if model_task == 2: - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - - # assert task_example.task_mask == [0, 0, 1, 0, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - 
task_example.add_categorical_slots(state_update) - - if task_example.categorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - old_example = task_example - - for value_id, value in enumerate( - schemas.get_service_schema(service).get_categorical_slot_values(slot) - ): - if self.dataset_split != 'train' or task_example.categorical_slot_status == 1: - task_example = old_example.make_copy_of_categorical_features() - task_example.task_mask[3] = 1 - # assert task_example.task_mask == [0, 0, 0, 1, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.categorical_slot_value_id = value_id - task_example.example_id = base_example.example_id + f"-3-{slot_id}-{value_id}" - task_example.example_id_num = base_example.example_id_num + [3, slot_id, value_id] - slot_description = slot + " " + value # add slot description - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - task_example.add_categorical_slots(state_update) - assert task_example.categorical_slot_status == old_example.categorical_slot_status - examples.append(task_example) - - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - if model_task == 4: # noncat slot status - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).non_categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 1, 0] - task_example.noncategorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - - user_span_boundaries = self._find_subword_indices( - state_update, - user_utterance, - ex["label_positions"]["slots"], - user_alignments, - user_tokens, - 2 + len(slot_tokens) + len(system_tokens), - ) - - if system_slots is not None: - system_span_boundaries = self._find_subword_indices( - state_update, - system_utterance, - system_slots, - system_alignments, - system_tokens, - 2 + len(slot_tokens), - ) - else: - system_span_boundaries = {} - - task_example.add_noncategorical_slots(state_update, user_span_boundaries, system_span_boundaries) - if task_example.noncategorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - - if self.dataset_split != 'train' or task_example.noncategorical_slot_status == 1: - task_example = task_example.make_copy_of_non_categorical_features() - task_example.task_mask[5] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 0, 1] - task_example.example_id = base_example.example_id + f"-5-{slot_id}-0" - task_example.example_id_num = base_example.example_id_num + [5, slot_id, 0] - examples.append(task_example) 
- - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - for example in examples: - self.features.append(example) - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - Find indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for key, slot_span in char_slot_spans.items(): - # print(key, slot, slot_span, char_slot_spans) - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py deleted file mode 100644 index c1308238bea1..000000000000 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict, List, Optional, Union - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueZeroShotIntentDataset'] - - -class DialogueZeroShotIntentDataset(GLUEDataset): - """ - Dataset for training a NLI model for zero shot intent recognition. Similar to GLUE/MNLI - dataset, but allows the user to specify which columns in the data files contain the - premise, hypothesis, and gold label. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - cfg: config dict for dataset - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - """ - # deprecation warning - deprecated_warning("DialogueZeroShotIntentDataset") - - self.cfg = cfg - self.tokenizer = tokenizer - if self.cfg.num_classes not in [2, 3]: - raise ValueError("num_classes must be either 2 or 3!") - self.label_list = ( - ["contradiction", "entailment", "neutral"] - if self.cfg.num_classes == 3 - else ['not_entailment', 'entailment'] - ) - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': ( - tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None - ), - } - - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.examples = self._create_examples(self.raw_features, dataset_split) - self.features = self.convert_examples_to_features( - self.examples, - [0, 1, 2, 3], - self.cfg.max_seq_length, - tokenizer, - output_mode="classification", - **token_params, - ) - - def _create_examples(self, raw_features, dataset_split: str): - """Creates examples for the training and dev sets.""" - examples = [] - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - intent = ex["labels"]["intent"] - for candidate_idx, candidate_intent in enumerate(ex["possible_labels"]["intent"]): - guid = "{}-{}-{}".format(dataset_split, idx, candidate_idx) - text_a = user_utterance - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . 
[SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . [SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - if hasattr(tokenizer, 'text_to_tokens'): - tokens_a = tokenizer.text_to_tokens(example.text_a) - else: - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - if hasattr(tokenizer, 'text_to_tokens'): - tokens_b = tokenizer.text_to_tokens(example.text_b) - else: - tokens_b = tokenizer.tokenize(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - if hasattr(tokenizer, 'tokens_to_ids'): - input_ids = tokenizer.tokens_to_ids(tokens) - else: - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - - if hasattr(tokenizer, 'tokens_to_ids'): - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - else: - pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] - - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - - return features - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/dialogue/input_example/__init__.py b/nemo/collections/nlp/data/dialogue/input_example/__init__.py deleted file mode 100644 index de4cf417e58c..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
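
Reviewer note: the DialogueZeroShotIntentDataset removed above casts intent recognition as NLI, pairing the utterance with every candidate intent rendered through a prompt template and labelling the pair as entailment only when it matches the gold intent. A small sketch of that pairing step follows; the prompt template string is illustrative, the real one comes from `cfg.prompt_template`.

```python
# Sketch of the premise/hypothesis pairing built by the removed
# DialogueZeroShotIntentDataset._create_examples.
from dataclasses import dataclass
from typing import List


@dataclass
class NLIPair:
    premise: str      # user utterance
    hypothesis: str   # prompt template + candidate intent
    label: int        # 1 = entailment (correct intent), 0 = not entailment


def make_pairs(
    utterance: str,
    intent: str,
    candidates: List[str],
    prompt_template: str = "the intent of the user is",
) -> List[NLIPair]:
    return [
        NLIPair(
            premise=utterance,
            hypothesis=f"{prompt_template} {candidate}",
            label=1 if candidate == intent else 0,
        )
        for candidate in candidates
    ]


if __name__ == "__main__":
    for pair in make_pairs(
        "please set an alarm for 7 am",
        intent="set alarm",
        candidates=["set alarm", "play music", "check weather"],
    ):
        print(pair)
```
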
- -from nemo.collections.nlp.data.dialogue.input_example.assistant_input_example import DialogueAssistantInputExample -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import DialogueSGDInputExample, SGDInputExample diff --git a/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py deleted file mode 100644 index c5574e8fa103..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueAssistantInputExample(DialogueInputExample): - """ - Template for DialogueAssistantInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , - "intent": , - "slots": { - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - # note for the Assistant dataset, start and end are word positions rather than char position - # these are whitespace-delimited word positions rather than tokenization-specific sub-word tokens. - "exclusive_end": 3, - "slot": "restaurant_name", - "start": 1 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - # all slots for categorical variables - # empty list for extractive slots - # Assistant only support extractive slots - "": [], - "": [], - } - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py deleted file mode 100644 index 80f3152cd82e..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
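Because the Assistant examples record slot positions as whitespace-delimited word indices with an exclusive end (rather than character offsets), recovering the surface value is a plain list slice. The utterance and span below are invented for illustration.

# Illustrative only: utterance and span values are hypothetical.
utterance = "book the hard rock cafe for seven pm"
span = {"slot": "restaurant_name", "start": 2, "exclusive_end": 5}

words = utterance.split()
slot_value = " ".join(words[span["start"]:span["exclusive_end"]])
print(slot_value)  # -> "hard rock cafe"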
- -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueDesignInputExample(DialogueInputExample): - """ - Template for DialogueDesignInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "system_utterance": , - "labels": { - "service": , - "intent": , - "slots": { - : '', - : '', - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': [, , ...], - "service": [, , ...], - "slots": { - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - "": "", - "": "", - } - }, - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/input_example.py b/nemo/collections/nlp/data/dialogue/input_example/input_example.py deleted file mode 100644 index 4920c2927f46..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/input_example.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ['DialogueInputExample'] - - -class DialogueInputExample(object): - """ - Generic Dialogue Input Example - Uses data: dict as a flexible interface to support various input types. - This ranges from classification labels, to complex nested labels such as those in SGD - - { - "utterance": , - "labels": { - "intent": , - "slots": { ... }, - } - } - """ - - def __init__(self, data: dict): - self.data = data - - def __repr__(self): - return self.data - - def __str__(self): - return self.data diff --git a/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py deleted file mode 100644 index e6576d40460b..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
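Each dataset-specific template above only documents a shape for the data dict; instantiation always goes through the base class. A small sketch with invented labels, assuming the DialogueInputExample class shown above is in scope; note that, as written, __repr__ and __str__ return the dict itself, so wrapping it in str(...) would be needed for them to yield actual strings.

# Hypothetical example; the utterance, intent and slot values are made up.
example = DialogueInputExample({
    "utterance": "set an alarm for 7 am",
    "labels": {"intent": "alarm_set", "slots": {"alarm_time": ["7 am"]}},
})
print(example.data["labels"]["intent"])  # -> "alarm_set"
# print(example) would raise as written, because __str__ returns a dict;
# returning str(self.data) instead keeps __repr__/__str__ string-valued.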
- -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class MellonQAInputExample(DialogueInputExample): - """ - Template for MellonQAInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "labels": { - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py deleted file mode 100644 index ded84d3ece67..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueMSMarcoInputExample(DialogueInputExample): - """ - Template for DialogueMSMarcoInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , # this is the domain - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - }, - "possible_labels": { - "service": [, , ...], - "passage": [, , ...], - } - } - """ diff --git a/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py b/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py deleted file mode 100644 index 9862a07baccd..000000000000 --- a/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" - -from typing import List - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils import logging - -__all__ = [ - 'SGDInputExample', - 'STR_DONTCARE', - 'STATUS_OFF', - 'STATUS_ACTIVE', - 'STATUS_DONTCARE', -] - - -class DialogueSGDInputExample(DialogueInputExample): - - """ - Template for DialogueSGDInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "example_id": , - "example_id_num": , - "utterance": , - "system_utterance": , - "system_slots": None or { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - "system_actions": None or [{ - "act": "INFORM", - "canonical_values": [ - "2019-03-02" - ], - "slot": "date", - "values": [ - "March 2nd" - ] - }, ...] - "labels": { - "service": , - "intent": , - "slots": { - #only non-empty slots - #most slot values are list of length 1 - #but there are some of length 2 as both are accepted - #e.g. 1930 and 7:30 pm - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - #all slots including empty - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - #only non-empty slots - "": , - "": , - } - } - } - - """ - - -STR_DONTCARE = "dontcare" - -# These are used to represent the status of slots (off, active, dontcare) and -# intents (off, active) in dialogue state tracking. -STATUS_OFF = 0 -STATUS_ACTIVE = 1 -STATUS_DONTCARE = 2 - - -class SGDInputExample(object): - """An example for training/inference.""" - - def __init__( - self, - schema_config: dict, - tokenizer: object, - service_schema: object = None, - example_id: str = "NONE", - example_id_num: List[int] = [], - ): - """ - Constructs an InputExample. - Args: - schema_config: configuration - tokenizer: tokenizer object - service_schema: A ServiceSchema object wrapping the schema for the service - corresponding to this example. - example_id: Unique identifier for the example, like: 'train-1_00000-00-Restaurants_1' - example_id_num: dialogue_id and turn_id combined and service id combined into a list of ints, - like: [1, 0, 0, 18] - """ - self.schema_config = schema_config - self.service_schema = service_schema - self.service_id = None - if service_schema: - self.service_id = service_schema.service_id - self.example_id = example_id - self.example_id_num = example_id_num - self._max_seq_length = schema_config["MAX_SEQ_LENGTH"] - self._tokenizer = tokenizer - if self._tokenizer is None: - raise ValueError("Must specify tokenizer") - - self.user_utterance = '' - self.system_utterance = '' - # The id of each subword in the vocabulary for BERT. - self.utterance_ids = [0] * self._max_seq_length - # Denotes the identity of the sequence. Takes values 0 (schema description) and 1 (system and user utterance). - self.utterance_segment = [0] * self._max_seq_length - # Mask which takes the value 0 for padded tokens and 1 otherwise. - self.utterance_mask = [0] * self._max_seq_length - # Start and inclusive end character indices in the original utterance - # corresponding to the tokens. 
This is used to obtain the character indices - # from the predicted subword indices during inference. - # NOTE: A positive value indicates the character indices in the schema description - # whereas a negative value indicates the character indices in the - # utterance. The indices are offset by 1 to prevent ambiguity in the - # 0 index, which could be in either the schema description or utterance by the - # above convention. Now the 0 index corresponds to padded tokens. - self.start_char_idx = [0] * self._max_seq_length - self.end_char_idx = [0] * self._max_seq_length - - # Id of categorical slot present in the example or 0 if not present. - self.categorical_slot_id = 0 - # Id of non categorical slot present in the example or 0 if not present. - self.noncategorical_slot_id = 0 - # The status of categorical slot in the example. - self.categorical_slot_status = STATUS_OFF - # The status of non categorical slot in the example. - self.noncategorical_slot_status = STATUS_OFF - # Masks out tasks not represented by example - self.task_mask = [0] * schema_config["NUM_TASKS"] - - # The index of the starting subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_start = 0 - # The index of the ending (inclusive) subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_end = 0 - - # Id of categorical slot value present in the example or 0 if not present. - self.categorical_slot_value_id = 0 - # The status of categorical slot value in the example. - self.categorical_slot_value_status = STATUS_OFF - # Id of requested slot present in the example or 0 if not present. - self.requested_slot_id = 0 - # Takes value 1 if the corresponding slot is requested, 0 otherwise. - self.requested_slot_status = STATUS_OFF - - # ID of intent present in the example. - self.intent_id = 0 - # Takes value 1 if the intent is active, 0 otherwise. - self.intent_status = STATUS_OFF - - @property - def readable_summary(self): - """Get a readable dict that summarizes the attributes of an InputExample.""" - seq_length = sum(self.utterance_mask) - utt_toks = self._tokenizer.ids_to_tokens(self.utterance_ids[:seq_length]) - utt_tok_mask_pairs = list(zip(utt_toks, self.utterance_segment[:seq_length])) - active_intent = ( - self.service_schema.get_intent_from_id(self.intent_id) if self.intent_status == STATUS_ACTIVE else "" - ) - slot_values_in_state = {} - if self.categorical_slot_status == STATUS_ACTIVE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = self.service_schema.get_categorical_slot_value_from_id( - self.categorical_slot_id, self.categorical_slot_value_id - ) - elif self.categorical_slot_status == STATUS_DONTCARE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = STR_DONTCARE - if self.noncategorical_slot_status == STATUS_ACTIVE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - start_id = self.noncategorical_slot_value_start[slot] - end_id = self.noncategorical_slot_value_end[slot] - # Token list is consisted of the subwords that may start with "##". We - # remove "##" to reconstruct the original value. Note that it's not a - # strict restoration of the original string. It's primarily used for - # debugging. - # ex. 
["san", "j", "##ose"] --> "san jose" - readable_value = " ".join(utt_toks[start_id : end_id + 1]).replace(" ##", "") - slot_values_in_state[slot] = readable_value - elif self.noncategorical_slot_status == STATUS_DONTCARE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - slot_values_in_state[slot] = STR_DONTCARE - - summary_dict = { - "utt_tok_mask_pairs": utt_tok_mask_pairs, - "utt_len": seq_length, - "categorical_slot_id": self.categorical_slot_id, - "noncategorical_slot_id": self.noncategorical_slot_id, - "intent_id": self.intent_id, - "service_name": self.service_schema.service_name, - "active_intent": active_intent, - "slot_values_in_state": slot_values_in_state, - } - return summary_dict - - def add_utterance_features( - self, system_tokens, system_inv_alignments, user_tokens, user_inv_alignments, system_utterance, user_utterance - ): - """Add utterance related features input to InputExample. - - Note: this method modifies the system tokens and user_tokens in place to - make their total length <= the maximum input length for BERT model. - - Args: - system_tokens: a list of strings which represents schema description. - system_inv_alignments: a list of tuples which denotes the start and end - charater of the tpken that a bert token originates from in the original - schema description. - user_tokens: a list of strings which represents utterance. - user_inv_alignments: a list of tuples which denotes the start and end - charater of the token that a bert token originates from in the original - system and user utterance. - """ - # Input sequence length for utterance BERT encoder - max_utt_len = self._max_seq_length - - # Modify lengths of schema description & utterance so that length of total utt - # (including cls_token, setp_token, sep_token) is no more than max_utt_len - is_too_long = truncate_seq_pair(system_tokens, user_tokens, max_utt_len - 3) - if is_too_long: - logging.debug( - f'Utterance sequence truncated in example id - {self.example_id} from {len(system_tokens) + len(user_tokens)}.' - ) - - # Construct the tokens, segment mask and valid token mask which will be - # input to BERT, using the tokens for schema description (sequence A) and - # system and user utterance (sequence B). - utt_subword = [] - utt_seg = [] - utt_mask = [] - start_char_idx = [] - end_char_idx = [] - - utt_subword.append(self._tokenizer.cls_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(system_tokens): - utt_subword.append(subword) - utt_seg.append(0) - utt_mask.append(1) - st, en = system_inv_alignments[subword_idx] - start_char_idx.append(-(st + 1)) - end_char_idx.append(-(en + 1)) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(user_tokens): - utt_subword.append(subword) - utt_seg.append(1) - utt_mask.append(1) - st, en = user_inv_alignments[subword_idx] - start_char_idx.append(st + 1) - end_char_idx.append(en + 1) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(1) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - utterance_ids = self._tokenizer.tokens_to_ids(utt_subword) - - # Zero-pad up to the BERT input sequence length. 
- while len(utterance_ids) < max_utt_len: - utterance_ids.append(0) - utt_seg.append(0) - utt_mask.append(0) - start_char_idx.append(0) - end_char_idx.append(0) - self.utterance_ids = utterance_ids - self.utterance_segment = utt_seg - self.utterance_mask = utt_mask - self.start_char_idx = start_char_idx - self.end_char_idx = end_char_idx - - self.user_utterance = user_utterance - self.system_utterance = system_utterance - - def make_copy(self): - """Make a copy of the current example with utterance features.""" - new_example = SGDInputExample( - schema_config=self.schema_config, - service_schema=self.service_schema, - example_id=self.example_id, - example_id_num=self.example_id_num.copy(), - tokenizer=self._tokenizer, - ) - return new_example - - def make_copy_of_categorical_features(self): - """Make a copy of the current example with utterance and categorical features.""" - new_example = self.make_copy() - - new_example.categorical_slot_status = self.categorical_slot_status - return new_example - - def make_copy_of_non_categorical_features(self): - """Make a copy of the current example with utterance features and non categorical features.""" - new_example = self.make_copy() - new_example.noncategorical_slot_id = self.noncategorical_slot_id - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.utterance_ids = list(self.utterance_ids) - new_example.utterance_segment = list(self.utterance_segment) - new_example.utterance_mask = list(self.utterance_mask) - new_example.start_char_idx = list(self.start_char_idx) - new_example.end_char_idx = list(self.end_char_idx) - new_example.user_utterance = self.user_utterance - new_example.system_utterance = self.system_utterance - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.noncategorical_slot_value_start = self.noncategorical_slot_value_start - new_example.noncategorical_slot_value_end = self.noncategorical_slot_value_end - return new_example - - def add_categorical_slots(self, state_update: dict): - """Add features for categorical slots. - Args: - state_update: slot value pairs of the state update - """ - - categorical_slots = self.service_schema.categorical_slots - if not categorical_slots: - return - slot = categorical_slots[self.categorical_slot_id] - values = state_update.get(slot, []) - - if not values: - self.categorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.categorical_slot_status = STATUS_DONTCARE - else: - self.categorical_slot_status = STATUS_ACTIVE - self.categorical_slot_value_status = ( - self.categorical_slot_value_id == self.service_schema.get_categorical_slot_value_id(slot, values[0]) - ) - - def add_noncategorical_slots(self, state_update: dict, system_span_boundaries: dict, user_span_boundaries: dict): - """Add features for non-categorical slots. - Args: - state_update: slot value pairs of state update - system_span_boundaries: span boundaries of schema description - user_span_boundaries: span boundaries of utterance - """ - - noncategorical_slots = self.service_schema.non_categorical_slots - slot = noncategorical_slots[self.noncategorical_slot_id] - - values = state_update.get(slot, []) - if not values: - self.noncategorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.noncategorical_slot_status = STATUS_DONTCARE - else: - self.noncategorical_slot_status = STATUS_ACTIVE - # Add indices of the start and end tokens for the first encountered - # value. 
Spans in user utterance are prioritized over the system - # utterance. If a span is not found, the slot value is ignored. - if slot in user_span_boundaries: - start, end = user_span_boundaries[slot] - elif slot in system_span_boundaries: - start, end = system_span_boundaries[slot] - else: - # A span may not be found because the value was cropped out or because - # the value was mentioned earlier in the dialogue. Since this model - # only makes use of the last two utterances to predict state updates, - # it will fail in such cases. - logging.debug( - f'"Slot values {str(values)} not found in user or system utterance in example with id - {self.example_id}.' - ) - start = 0 - end = 0 - self.noncategorical_slot_value_start = start - self.noncategorical_slot_value_end = end - - def add_requested_slots(self, frame: dict): - """Add requested slots to InputExample - Args: - frame: frame object from which requested slots are extracted - """ - all_slots = self.service_schema.slots - slot = all_slots[self.requested_slot_id] - if slot in frame["labels"]["slots"]: - self.requested_slot_status = STATUS_ACTIVE - - def add_intents(self, frame): - """Add intents to InputExample - Args: - frame: frame object from which intents are extracted - """ - all_intents = self.service_schema.intents - intent = all_intents[self.intent_id] - if intent == frame["labels"]["intent"]: - self.intent_status = STATUS_ACTIVE - - -# Modified from run_classifier._truncate_seq_pair in the public bert model repo. -# https://github.com/google-research/bert/blob/master/run_classifier.py. -def truncate_seq_pair(tokens_a: List[int], tokens_b: List[int], max_length: int) -> bool: - """Truncate a seq pair in place so that their total length <= max_length. - Args: - tokens_a: first token sequence - tokens_b: second token sequence - max_length: truncated sequence length - Returns: - is_too_long: whether combined sequences exceed maximum sequence length - """ - is_too_long = False - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - is_too_long = True - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - return is_too_long diff --git a/nemo/collections/nlp/data/dialogue/sgd/__init__.py b/nemo/collections/nlp/data/dialogue/sgd/__init__.py deleted file mode 100644 index 9bc88d075659..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
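The start_char_idx/end_char_idx convention above (indices offset by 1, positive for the system/user utterance, negative for the schema description, 0 reserved for padding and special tokens) implies a small decoding step when mapping predicted subword spans back to characters. A sketch under those assumptions; the index arrays in the usage comment are invented.

def decode_char_span(start_char_idx, end_char_idx, tok_start, tok_end):
    """Map a predicted subword span back to inclusive character offsets.

    Positive entries point into the utterance, negative entries into the
    schema description, and 0 marks padding/special tokens.
    """
    st, en = start_char_idx[tok_start], end_char_idx[tok_end]
    if st > 0 and en > 0:
        return "utterance", st - 1, en - 1
    if st < 0 and en < 0:
        return "schema", -st - 1, -en - 1
    return None  # padding or special token: no character span

# e.g. with start_char_idx = [0, -1, -6, 0, 1, 6] and end_char_idx = [0, -4, -9, 0, 4, 9],
# decode_char_span(start_char_idx, end_char_idx, 4, 5) -> ("utterance", 0, 8)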
- -from nemo.collections.nlp.data.dialogue.sgd.evaluate import evaluate, get_in_domain_services -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/nemo/collections/nlp/data/dialogue/sgd/evaluate.py b/nemo/collections/nlp/data/dialogue/sgd/evaluate.py deleted file mode 100644 index 0829543dcc51..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/evaluate.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Evaluate predictions JSON file, w.r.t. ground truth file. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/evaluate.py -""" - -import collections -import glob -import json -import os - -import numpy as np - -from nemo.collections.nlp.metrics.sgd_metrics import ( - ACTIVE_INTENT_ACCURACY, - JOINT_CAT_ACCURACY, - JOINT_GOAL_ACCURACY, - JOINT_NONCAT_ACCURACY, - NAN_VAL, - REQUESTED_SLOTS_F1, - REQUESTED_SLOTS_PRECISION, - REQUESTED_SLOTS_RECALL, - SLOT_TAGGING_F1, - SLOT_TAGGING_PRECISION, - SLOT_TAGGING_RECALL, - get_active_intent_accuracy, - get_average_and_joint_goal_accuracy, - get_requested_slots_f1, - get_slot_tagging_f1, -) -from nemo.utils import logging - -__all__ = ['get_in_domain_services'] - -ALL_SERVICES = "#ALL_SERVICES" -SEEN_SERVICES = "#SEEN_SERVICES" -UNSEEN_SERVICES = "#UNSEEN_SERVICES" - -# Name of the file containing all predictions and their corresponding frame metrics. -PER_FRAME_OUTPUT_FILENAME = "dialogues_and_metrics.json" - - -def get_service_set(schema_path: str) -> set: - """ - Get the set of all services present in a schema. - Args: - schema_path: schema file path - Returns: - service_set: set of services in file - """ - service_set = set() - with open(schema_path, encoding="UTF-8") as f: - schema = json.load(f) - for service in schema: - service_set.add(service["service_name"]) - f.close() - return service_set - - -def get_in_domain_services(schema_path: str, service_set: set) -> set: - """Get the set of common services between a schema and set of services. - Args: - schema_path: path to schema file - service_set: set of services - Returns: - joint_services: joint services between schema path file and service set - """ - joint_services = get_service_set(schema_path) & service_set - return joint_services - - -def get_dataset_as_dict(file_path_patterns) -> dict: - """Read the DSTC8/SGD json dialogue data as dictionary with dialog ID as keys. 
- Args: - file_path_patterns: list or directory of files - Returns: - dataset_dict: dataset dictionary with dialog ID as keys - """ - dataset_dict = {} - if isinstance(file_path_patterns, list): - list_fp = file_path_patterns - else: - list_fp = sorted(glob.glob(file_path_patterns)) - for fp in list_fp: - if PER_FRAME_OUTPUT_FILENAME in fp: - continue - logging.debug("Loading file: %s", fp) - with open(fp, encoding="UTF-8") as f: - data = json.load(f) - if isinstance(data, list): - for dial in data: - dataset_dict[dial["dialogue_id"]] = dial - elif isinstance(data, dict): - dataset_dict.update(data) - f.close() - return dataset_dict - - -def get_metrics( - dataset_ref: dict, - dataset_hyp: dict, - service_schemas: dict, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -): - """Calculate the DSTC8/SGD metrics. - Args: - dataset_ref: The ground truth dataset represented as a dict mapping dialogue id to the corresponding dialogue. - dataset_hyp: The predictions in the same format as `dataset_ref`. - service_schemas: A dict mapping service name to the schema for the service. - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - all_metric_aggregate: A dict mapping a metric collection name to a dict containing the values - for various metrics. Each metric collection aggregates the metrics across a specific set of frames in the dialogues. - per_frame_metric: metrics aggregated for each frame - """ - # Metrics can be aggregated in various ways, eg over all dialogues, only for - # dialogues containing unseen services or for dialogues corresponding to a - # single service. This aggregation is done through metric_collections, which - # is a dict mapping a collection name to a dict, which maps a metric to a list - # of values for that metric. Each value in this list is the value taken by - # the metric on a frame. - metric_collections = collections.defaultdict(lambda: collections.defaultdict(list)) - - # Ensure the dialogs in dataset_hyp also occur in dataset_ref. - assert set(dataset_hyp.keys()).issubset(set(dataset_ref.keys())) - logging.debug("len(dataset_hyp)=%d, len(dataset_ref)=%d", len(dataset_hyp), len(dataset_ref)) - - # Store metrics for every frame for debugging. - per_frame_metric = {} - - for dial_id, dial_hyp in dataset_hyp.items(): - dial_ref = dataset_ref[dial_id] - - if set(dial_ref["services"]) != set(dial_hyp["services"]): - raise ValueError( - "Set of services present in ground truth and predictions don't match " - "for dialogue with id {}".format(dial_id) - ) - - joint_metrics = [JOINT_GOAL_ACCURACY, JOINT_CAT_ACCURACY, JOINT_NONCAT_ACCURACY] - for turn_id, (turn_ref, turn_hyp) in enumerate(zip(dial_ref["turns"], dial_hyp["turns"])): - metric_collections_per_turn = collections.defaultdict(lambda: collections.defaultdict(lambda: 1.0)) - if turn_ref["speaker"] != turn_hyp["speaker"]: - raise ValueError("Speakers don't match in dialogue with id {}".format(dial_id)) - - # Skip system turns because metrics are only computed for user turns. 
- if turn_ref["speaker"] != "USER": - continue - - if turn_ref["utterance"] != turn_hyp["utterance"]: - logging.error("Ref utt: %s", turn_ref["utterance"]) - logging.error("Hyp utt: %s", turn_hyp["utterance"]) - raise ValueError("Utterances don't match for dialogue with id {}".format(dial_id)) - - hyp_frames_by_service = {frame["service"]: frame for frame in turn_hyp["frames"]} - - # Calculate metrics for each frame in each user turn. - for frame_ref in turn_ref["frames"]: - service_name = frame_ref["service"] - if service_name not in hyp_frames_by_service: - raise ValueError( - "Frame for service {} not found in dialogue with id {}".format(service_name, dial_id) - ) - service = service_schemas[service_name] - frame_hyp = hyp_frames_by_service[service_name] - - active_intent_acc = get_active_intent_accuracy(frame_ref, frame_hyp) - slot_tagging_f1_scores = get_slot_tagging_f1(frame_ref, frame_hyp, turn_ref["utterance"], service) - requested_slots_f1_scores = get_requested_slots_f1(frame_ref, frame_hyp) - goal_accuracy_dict = get_average_and_joint_goal_accuracy( - frame_ref, frame_hyp, service, use_fuzzy_match - ) - - frame_metric = { - ACTIVE_INTENT_ACCURACY: active_intent_acc, - REQUESTED_SLOTS_F1: requested_slots_f1_scores.f1, - REQUESTED_SLOTS_PRECISION: requested_slots_f1_scores.precision, - REQUESTED_SLOTS_RECALL: requested_slots_f1_scores.recall, - } - if slot_tagging_f1_scores is not None: - frame_metric[SLOT_TAGGING_F1] = slot_tagging_f1_scores.f1 - frame_metric[SLOT_TAGGING_PRECISION] = slot_tagging_f1_scores.precision - frame_metric[SLOT_TAGGING_RECALL] = slot_tagging_f1_scores.recall - frame_metric.update(goal_accuracy_dict) - - frame_id = "{:s}-{:03d}-{:s}".format(dial_id, turn_id, frame_hyp["service"]) - per_frame_metric[frame_id] = frame_metric - # Add the frame-level metric result back to dialogues. - frame_hyp["metrics"] = frame_metric - - # Get the domain name of the service. - domain_name = frame_hyp["service"].split("_")[0] - domain_keys = [ALL_SERVICES, frame_hyp["service"], domain_name] - if frame_hyp["service"] in in_domain_services: - domain_keys.append(SEEN_SERVICES) - - else: - domain_keys.append(UNSEEN_SERVICES) - for domain_key in domain_keys: - for metric_key, metric_value in frame_metric.items(): - if metric_value != NAN_VAL: - if joint_acc_across_turn and metric_key in joint_metrics: - metric_collections_per_turn[domain_key][metric_key] *= metric_value - else: - metric_collections[domain_key][metric_key].append(metric_value) - if joint_acc_across_turn: - # Conduct multiwoz style evaluation that computes joint goal accuracy - # across all the slot values of all the domains for each turn. - for domain_key in metric_collections_per_turn: - for metric_key, metric_value in metric_collections_per_turn[domain_key].items(): - metric_collections[domain_key][metric_key].append(metric_value) - - all_metric_aggregate = {} - for domain_key, domain_metric_vals in metric_collections.items(): - domain_metric_aggregate = {} - for metric_key, value_list in domain_metric_vals.items(): - if value_list: - # Metrics are macro-averaged across all frames. 
- domain_metric_aggregate[metric_key] = round(float(np.mean(value_list)) * 100.0, 2) - else: - domain_metric_aggregate[metric_key] = NAN_VAL - all_metric_aggregate[domain_key] = domain_metric_aggregate - return all_metric_aggregate, per_frame_metric - - -def evaluate( - prediction_dir: str, - data_dir: str, - eval_dataset: str, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -) -> dict: - """Calculate the DSTC8/SGD metrics for given data. - - Args: - prediction_dir: prediction location - data_dir: ground truth data location. - eval_dataset: evaluation data split - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint goal accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - A dict mapping a metric collection name to a dict containing the values - for various metrics for all dialogues and all services - """ - - with open(os.path.join(data_dir, eval_dataset, "schema.json"), encoding="UTF-8") as f: - eval_services = {} - list_services = json.load(f) - for service in list_services: - eval_services[service["service_name"]] = service - f.close() - - dataset_ref = get_dataset_as_dict(os.path.join(data_dir, eval_dataset, "dialogues_*.json")) - dataset_hyp = get_dataset_as_dict(os.path.join(prediction_dir, "*.json")) - - # has ALLSERVICE, SEEN_SERVICES, UNSEEN_SERVICES, SERVICE, DOMAIN - all_metric_aggregate, _ = get_metrics( - dataset_ref, dataset_hyp, eval_services, in_domain_services, joint_acc_across_turn, use_fuzzy_match - ) - if SEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {SEEN_SERVICES} : {sorted(all_metric_aggregate[SEEN_SERVICES].items())}') - if UNSEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {UNSEEN_SERVICES}: {sorted(all_metric_aggregate[UNSEEN_SERVICES].items())}') - if ALL_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {ALL_SERVICES} : {sorted(all_metric_aggregate[ALL_SERVICES].items())}') - - # Write the per-frame metrics values with the corrresponding dialogue frames. - with open(os.path.join(prediction_dir, PER_FRAME_OUTPUT_FILENAME), "w", encoding="UTF-8") as f: - json.dump(dataset_hyp, f, indent=2, separators=(",", ": ")) - f.close() - return all_metric_aggregate[ALL_SERVICES] diff --git a/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py b/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py deleted file mode 100644 index c9ddd2fd6f23..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
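The aggregation in get_metrics boils down to appending each frame's value to every matching collection key and then macro-averaging per key. A toy, standalone sketch of that pattern; the service names, metric name and values below are invented.

import collections

import numpy as np

metric_collections = collections.defaultdict(lambda: collections.defaultdict(list))

# Pretend two frames were scored; '#ALL_SERVICES' aggregates everything,
# per-service keys aggregate only their own frames.
for domain_key, value in [("#ALL_SERVICES", 1.0), ("Restaurants_1", 1.0),
                          ("#ALL_SERVICES", 0.0), ("Hotels_2", 0.0)]:
    metric_collections[domain_key]["active_intent_accuracy"].append(value)

aggregate = {
    domain: {name: round(float(np.mean(vals)) * 100.0, 2) for name, vals in metrics.items()}
    for domain, metrics in metric_collections.items()
}
print(aggregate["#ALL_SERVICES"])  # -> {'active_intent_accuracy': 50.0}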
- -""" -Prediction and evaluation-related utility functions. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/pred_utils.py -""" - -import json -import os -from collections import OrderedDict, defaultdict -from typing import Dict, List, Optional - -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import ( - STATUS_ACTIVE, - STATUS_DONTCARE, - STR_DONTCARE, -) -from nemo.utils import logging - -REQ_SLOT_THRESHOLD = 0.5 - - -__all__ = ['write_predictions_to_file'] - - -def set_cat_slot(predictions_status: dict, predictions_value: dict, cat_slot_values: Dict[str, List[str]]) -> dict: - """ - Extract predicted categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - cat_slot_values: possible categorical slots and their potential values for this service - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(cat_slot_values): - slot_status = predictions_status[slot_idx][0]["cat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tmp = predictions_value[slot_idx] - value_idx = max(tmp, key=lambda k: tmp[k]['cat_slot_value_status'][0].item()) - out_dict[slot] = cat_slot_values[slot][value_idx] - return out_dict - - -def set_noncat_slot( - predictions_status: dict, - predictions_value: dict, - non_cat_slots: List[str], - user_utterance: str, - sys_slots_agg: Optional[dict] = None, -) -> dict: - """ - Extract predicted non categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - non_cat_slots: list of possible non categorical slots for this service - user_utterance: system and user utterance - sys_slots_agg: system retrieval lookup table. Contains for each slot the most recent value seen in the history - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(non_cat_slots): - slot_status = predictions_status[slot_idx][0]["noncat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tok_start_idx = predictions_value[slot_idx][0]["noncat_slot_start"] - tok_end_idx = predictions_value[slot_idx][0]["noncat_slot_end"] - ch_start_idx = predictions_value[slot_idx][0]["noncat_alignment_start"][tok_start_idx] - ch_end_idx = predictions_value[slot_idx][0]["noncat_alignment_end"][tok_end_idx] - if ch_start_idx > 0 and ch_end_idx > 0: - # Add span from the utterance. - out_dict[slot] = user_utterance[ch_start_idx - 1 : ch_end_idx] - elif sys_slots_agg and slot in sys_slots_agg: - # system retrieval - out_dict[slot] = sys_slots_agg[slot] - return out_dict - - -def get_predicted_dialog(dialog: dict, all_predictions: dict, schemas: object, state_tracker: str) -> dict: - """Overwrite the labels in the turn with the predictions from the model. For test set, these labels are missing from the data and hence they are added. - Args: - dialog: ground truth dialog - all_predictions: predictions - schemas: schema object of all services of all datasets - state_tracker: state tracker option, e.g. 
nemotracker - Returns: - dialog: dialog overwritten with prediction information - """ - dialog_id = dialog["dialogue_id"] - if state_tracker == "baseline": - sys_slots_agg = {} - else: - sys_slots_agg = defaultdict(OrderedDict) - all_slot_values = defaultdict(dict) - for turn_idx, turn in enumerate(dialog["turns"]): - if turn["speaker"] == "SYSTEM" and state_tracker == 'nemotracker': - for frame in turn["frames"]: - if frame["service"] not in sys_slots_agg: - sys_slots_agg[frame["service"]] = OrderedDict() - for action in frame["actions"]: - if action["slot"] and len(action["values"]) > 0: - sys_slots_agg[frame["service"]][action["slot"]] = action["values"][0] - if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - system_utterance = dialog["turns"][turn_idx - 1]["utterance"] if turn_idx else "" - system_user_utterance = system_utterance + ' ' + user_utterance - turn_id = "{:02d}".format(turn_idx) - for frame in turn["frames"]: - - predictions = all_predictions[(dialog_id, turn_id, frame["service"])] - slot_values = all_slot_values[frame["service"]] - service_schema = schemas.get_service_schema(frame["service"]) - # Remove the slot spans and state if present. - frame.pop("slots", None) - frame.pop("state", None) - - # The baseline model doesn't predict slot spans. Only state predictions - # are added. - state = {} - - # Add prediction for active intent. No Offset is subtracted since schema has now NONE intent at index 0 - state["active_intent"] = get_predicted_intent( - predictions=predictions[0], intents=service_schema.intents - ) - # Add prediction for requested slots. - state["requested_slots"] = get_requested_slot(predictions=predictions[1], slots=service_schema.slots) - - # Add prediction for user goal (slot values). - # Categorical slots. - cat_out_dict = set_cat_slot( - predictions_status=predictions[2], - predictions_value=predictions[3], - cat_slot_values=service_schema.categorical_slot_values, - ) - for k, v in cat_out_dict.items(): - slot_values[k] = v - - # Non-categorical slots. - noncat_out_dict = set_noncat_slot( - predictions_status=predictions[4], - predictions_value=predictions[5], - non_cat_slots=service_schema.non_categorical_slots, - user_utterance=system_user_utterance, - sys_slots_agg=sys_slots_agg.get(frame["service"], None), - ) - for k, v in noncat_out_dict.items(): - slot_values[k] = v - # Create a new dict to avoid overwriting the state in previous turns - # because of use of same objects. 
- state["slot_values"] = {s: [v] for s, v in slot_values.items()} - frame["state"] = state - return dialog - - -def get_predicted_intent(predictions: dict, intents: List[str]) -> str: - """ - Returns intent name with maximum score - Args: - predictions: predictions - intents: list of possible intents for this service - Returns: - intent: predicted intent - """ - assert len(predictions) == len(intents) - active_intent_id = max(predictions, key=lambda k: predictions[k][0]['intent_status']) - intent = intents[active_intent_id] - return intent - - -def get_requested_slot(predictions: dict, slots: List[str]) -> List[str]: - """ - Returns list of slots which are predicted to be requested - Args: - predictions: predictions - slots: list of possible slots - Returns: - requested_slots: list of requested slots - """ - active_indices = [k for k in predictions if predictions[k][0]["req_slot_status"] > REQ_SLOT_THRESHOLD] - requested_slots = list(map(lambda k: slots[k], active_indices)) - return requested_slots - - -def write_predictions_to_file( - predictions: List[dict], - input_json_files: List[str], - output_dir: str, - schemas: object, - state_tracker: str, - eval_debug: bool, - in_domain_services: set, -): - """Save predicted dialogues as json files. - - Args: - predictions: An iterator containing model predictions. This is the output of - the predict method in the estimator. - input_json_files: A list of json paths containing the dialogues to run - inference on. - output_dir: The directory where output json files will be created. - schemas: Schemas to all services in the dst dataset - state_tracker: state tracker option - eval_debug: output evaluation debugging information - in_domain_services: in domain services - """ - logging.info(f"Writing predictions to {output_dir} started.") - - # Index all predictions. - all_predictions = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - for idx, prediction in enumerate(predictions): - eval_dataset, dialog_id, turn_id, service_name, model_task, slot_intent_id, value_id = prediction[ - 'example_id' - ].split('-') - all_predictions[(dialog_id, turn_id, service_name)][int(model_task)][int(slot_intent_id)][ - int(value_id) - ] = prediction - logging.info(f'Predictions for {idx} examples in {eval_dataset} dataset are getting processed.') - - # Read each input file and write its predictions. - for input_file_path in input_json_files: - with open(input_file_path, encoding="UTF-8") as f: - dialogs = json.load(f) - logging.debug(f'{input_file_path} file is loaded') - pred_dialogs = [] - for d in dialogs: - pred_dialog = get_predicted_dialog(d, all_predictions, schemas, state_tracker) - pred_dialogs.append(pred_dialog) - input_file_name = os.path.basename(input_file_path) - output_file_path = os.path.join(output_dir, input_file_name) - with open(output_file_path, "w", encoding="UTF-8") as f: - json.dump(pred_dialogs, f, indent=2, separators=(",", ": "), sort_keys=True) diff --git a/nemo/collections/nlp/data/dialogue/sgd/schema.py b/nemo/collections/nlp/data/dialogue/sgd/schema.py deleted file mode 100644 index b12a11fdb63c..000000000000 --- a/nemo/collections/nlp/data/dialogue/sgd/schema.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Wrappers for schemas of different services. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/schema.py -""" - -import json -from typing import List, Optional, Union - -from nemo.utils import logging - -__all__ = ['Schema'] - - -class ServiceSchema(object): - """A wrapper for schema for a service.""" - - def __init__(self, schema_json: dict, service_id: Optional[int] = None): - """ - Constructor for ServiceSchema. - Args: - schema_json: schema json dict - service_id: service ID - """ - self._service_name = schema_json["service_name"] - self._description = schema_json["description"] - self._schema_json = schema_json - self._service_id = service_id - - # Construct the vocabulary for intents, slots, categorical slots, - # non-categorical slots and categorical slot values. - self._intents = ["NONE"] + sorted(i["name"] for i in schema_json["intents"]) - self._intent_descriptions = {i["name"]: i["description"] for i in schema_json["intents"]} - self._intent_descriptions["NONE"] = "none" - self._slots = sorted(s["name"] for s in schema_json["slots"]) - self._slots_descriptions = {s["name"]: s["description"] for s in schema_json["slots"]} - self._categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if s["is_categorical"] and s["name"] in self.state_slots - ) - self._non_categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if not s["is_categorical"] and s["name"] in self.state_slots - ) - slot_schemas = {s["name"]: s for s in schema_json["slots"]} - categorical_slot_values = {} - categorical_slot_value_ids = {} - categorical_slot_ids = {} - non_categorical_slot_ids = {} - for slot_id, slot in enumerate(self._categorical_slots): - slot_schema = slot_schemas[slot] - values = sorted(slot_schema["possible_values"]) - categorical_slot_values[slot] = values - value_ids = {value: idx for idx, value in enumerate(values)} - categorical_slot_value_ids[slot] = value_ids - categorical_slot_ids[slot] = slot_id - - for slot_id, slot in enumerate(self._non_categorical_slots): - non_categorical_slot_ids[slot] = slot_id - - self._categorical_slot_values = categorical_slot_values - self._categorical_slot_value_ids = categorical_slot_value_ids - - self._categorical_slot_ids = categorical_slot_ids - self._non_categorical_slot_ids = non_categorical_slot_ids - - @property - def schema_json(self) -> dict: - """Returns schema json dictionary""" - return self._schema_json - - @property - def state_slots(self) -> set: - """Set of slots which are permitted to be in the dialogue state.""" - state_slots = set() - for intent in self._schema_json["intents"]: - state_slots.update(intent["required_slots"]) - state_slots.update(intent["optional_slots"]) - return state_slots - - @property - def service_name(self): - return self._service_name - - @property - def service_id(self): - return self._service_id - - @property - def description(self): - return self._description - - @property - def slots(self): - return self._slots - - @property - def intents(self): - 
return self._intents - - @property - def intent_descriptions(self): - return self._intent_descriptions - - @property - def slot_descriptions(self): - return self._slots_descriptions - - @property - def categorical_slots(self): - return self._categorical_slots - - @property - def non_categorical_slots(self): - return self._non_categorical_slots - - @property - def categorical_slot_values(self): - return self._categorical_slot_values - - def get_categorical_slot_values(self, slot): - return self._categorical_slot_values[slot] - - def get_slot_from_id(self, slot_id): - return self._slots[slot_id] - - def get_intent_from_id(self, intent_id): - return self._intents[intent_id] - - def get_categorical_slot_from_id(self, slot_id): - return self._categorical_slots[slot_id] - - def get_non_categorical_slot_from_id(self, slot_id): - return self._non_categorical_slots[slot_id] - - def get_categorical_slot_value_from_id(self, slot_id, value_id): - slot = self._categorical_slots[slot_id] - return self._categorical_slot_values[slot][value_id] - - def get_categorical_slot_value_id(self, slot, value): - return self._categorical_slot_value_ids[slot][value] - - def get_categorical_slot_id(self, slot): - return self._categorical_slot_ids[slot] - - def get_non_categorical_slot_id(self, slot): - return self._non_categorical_slot_ids[slot] - - -class Schema(object): - """Wrapper for schemas for all services in a dataset.""" - - def __init__(self, schema_json_paths: Union[str, List[str]]): - """ - schema_json_paths: list of .json paths to schema files, or a single str with the path to the json file. - """ - # Load the schema from the json file. - if isinstance(schema_json_paths, str): - with open(schema_json_paths, "r") as f: - all_schemas = json.load(f) - f.close() - else: - # load multiple schemas from the list of the json files - all_schemas = [] - completed_services = [] - for schema_json_path in schema_json_paths: - with open(schema_json_path, "r") as f: - schemas = json.load(f) - f.close() - logging.debug("Num of services in %s: %s", schema_json_path, len(schemas)) - - for service in schemas: - if service['service_name'] not in completed_services: - completed_services.append(service['service_name']) - all_schemas.append(service) - - self._services = sorted(schema["service_name"] for schema in all_schemas) - self._services_vocab = {v: k for k, v in enumerate(self._services)} - self._services_id_to_vocab = {v: k for k, v in self._services_vocab.items()} - service_schemas = {} - for schema in all_schemas: - service = schema["service_name"] - service_schemas[service] = ServiceSchema(schema, service_id=self.get_service_id(service)) - - self._service_schemas = service_schemas - self._schemas = all_schemas - self._slots_relation_list = {} - - def get_service_id(self, service: str): - return self._services_vocab[service] - - def get_service_from_id(self, service_id: int): - return self._services[service_id] - - def get_service_schema(self, service: str): - return self._service_schemas[service] - - @property - def services(self): - return self._services - - def save_to_file(self, file_path): - """ - Saves schema object to file - Args: - file_path: path to store schema object at - """ - with open(file_path, "w") as f: - json.dump(self._schemas, f, indent=2) diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py index 18414412d91c..fda3c1f799b9 100644 --- a/nemo/collections/nlp/metrics/__init__.py +++ b/nemo/collections/nlp/metrics/__init__.py @@ -12,7 +12,7 @@ # See the License for
the specific language governing permissions and # limitations under the License. -from nemo.collections.nlp.metrics.classification_report import ClassificationReport, MultiLabelClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics -from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity +from nemo.collections.nlp.metrics.classification_report import ClassificationReport # noqa: F401 +from nemo.collections.nlp.metrics.classification_report import MultiLabelClassificationReport # noqa: F401 +from nemo.collections.nlp.metrics.qa_metrics import QAMetrics # noqa: F401 +from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity # noqa: F401 diff --git a/nemo/collections/nlp/metrics/dialogue_metrics.py b/nemo/collections/nlp/metrics/dialogue_metrics.py deleted file mode 100644 index 7330a1c90611..000000000000 --- a/nemo/collections/nlp/metrics/dialogue_metrics.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from collections import Counter - -import numpy as np -from sacrebleu import corpus_bleu - - -class DialogueGenerationMetrics(object): - @staticmethod - def save_predictions( - filename, generated_field, ground_truth_field, inputs, - ): - """ - Save predictions as a jsonl file - - Args: - Each arg is a list of strings (all args have the same length) - """ - docs = [] - for i in range(len(inputs)): - docs.append( - {"input": inputs[i], "ground_truth": ground_truth_field[i], "generated": generated_field[i],} - ) - with open(filename, 'w', encoding="UTF-8") as f: - for item in docs: - f.write(json.dumps(item) + "\n") - - @staticmethod - def _get_one_f1(generated_field, ground_truth_field): - """ - Get precision, recall, f1 based on token overlap between generated and ground_truth sequence - """ - generated_tokens = generated_field.split() - ground_truth_tokens = ground_truth_field.split() - - common = Counter(generated_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0, 0, 0 - precision = 1.0 * num_same / len(generated_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return np.array([precision * 100, recall * 100, f1 * 100]) - - @staticmethod - def get_f1(generated_fields, ground_truth_fields): - total_p_r_f1 = np.array( - [ - DialogueGenerationMetrics._get_one_f1(generated_fields[i], ground_truth_fields[i]) - for i in range(len(ground_truth_fields)) - ] - ) - avg_p_r_f1 = np.mean(total_p_r_f1, axis=0) - return avg_p_r_f1 - - @staticmethod - def get_bleu(generated_field, ground_truth_field): - """ - Referenced from NMT evaluation - Note 13a is the default tokenizer for English for WMT - Known issue that it doesn't hand edge case of None or '' - 
https://github.com/mjpost/sacrebleu/issues/161 - """ - valid_indices = [i for i in range(len(generated_field)) if generated_field[i] and ground_truth_field[i]] - generated_field = [generated_field[i] for i in valid_indices] - ground_truth_field = [ground_truth_field[i] for i in valid_indices] - sacre_bleu = corpus_bleu(generated_field, [ground_truth_field], tokenize="13a") - return sacre_bleu.score - - -class DialogueClassificationMetrics(object): - @staticmethod - def save_predictions( - filename, - generated_labels, - generated_slots, - ground_truth_labels, - ground_truth_slots, - generated_field, - ground_truth_field, - inputs, - ): - """ - Save predictions as a jsonl file - - Args: - Each arg is a list of strings (all args have the same length) - """ - docs = [] - for i in range(len(inputs)): - docs.append( - { - "input": inputs[i], - "ground_truth": ground_truth_field[i], - "ground_truth_slots": ground_truth_slots[i], - "ground_truth_labels": ground_truth_labels[i], - "generated": generated_field[i], - "generated_slots": generated_slots[i], - "generated_labels": generated_labels[i], - } - ) - with open(filename, 'w', encoding="UTF-8") as f: - for item in docs: - f.write(json.dumps(item) + "\n") - - @staticmethod - def split_label_and_slots(fields, with_slots=False): - """ - Split target into label and slots when doing joint label (i.e. intent) classificaiton and slot filling - - For instance, split "reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)" into - label = "reserve_restaurant" and slots = ["time_of_day(7pm)", "number_of_people(3)"] - Args: - fields: list of strings - """ - labels = [] - slots_list = [] - for field in fields: - if with_slots: - combo = [i.strip() for i in field.split('slots:', 1)] - label = 'none' - if len(combo) == 2: - label, slots = combo - elif len(combo) == 1: - slots = combo[0] - label = 'none' - if isinstance(slots, str): - # temporary patch for purnendu model output - if 'possible intents:' in slots: - slots = slots.split('possible intents:')[0] - slots = slots.split(', ') - else: - slots = ['None'] - else: - label = field - slots = [] - slots_list.append(slots) - labels.append(label) - - return labels, slots_list - - @staticmethod - def get_slot_filling_metrics(generated_slots, ground_truth_slots): - """ - Args: - generated_slots: list of list of strings. - Each string is slot-name and slot-value pair e.g. 
location(Seattle) - ground_truth_slots: list of list of strings - """ - all_recall = [] - all_precision = [] - all_joint_goal_accuracy = [] - - for i in range(len(generated_slots)): - # depulicate and sort - ground_truth = sorted(list(set(ground_truth_slots[i]))) - predicted = sorted(list(set(generated_slots[i]))) - correct = [item for item in predicted if item in ground_truth] - recall = len(correct) / len(ground_truth) if len(ground_truth) > 0 else 0 - precision = len(correct) / len(predicted) if len(predicted) > 0 else 0 - joint_goal_accuracy = int(ground_truth == predicted) - all_recall.append(recall) - all_precision.append(precision) - all_joint_goal_accuracy.append(joint_goal_accuracy) - - avg_joint_goal_accuracy = np.mean(all_joint_goal_accuracy) * 100 - avg_precision = np.mean(all_precision) * 100 - avg_recall = np.mean(all_recall) * 100 - avg_f1 = 2 * (avg_recall * avg_precision) / (avg_recall + avg_precision + 1e-20) - - return avg_precision, avg_recall, avg_f1, avg_joint_goal_accuracy diff --git a/nemo/collections/nlp/models/dialogue/__init__.py b/nemo/collections/nlp/models/dialogue/__init__.py deleted file mode 100644 index 2b75ee1a778a..000000000000 --- a/nemo/collections/nlp/models/dialogue/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.dialogue.dialogue_gpt_classification_model import DialogueGPTClassificationModel -from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel -from nemo.collections.nlp.models.dialogue.intent_slot_classification_model import IntentSlotClassificationModel -from nemo.collections.nlp.models.dialogue.sgdqa_model import SGDQAModel diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py deleted file mode 100644 index 6c7472b95c42..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ /dev/null @@ -1,805 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
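For readers who imported the metrics removed with dialogue_metrics.py above, the token-overlap F1 and slot-filling scores it computed reduce to a few lines of plain Python. The sketch below restates that logic outside of NeMo as an illustration only, not a drop-in replacement or part of the library after this change; the sample slot strings at the bottom are made-up values.

    # Standalone restatement of the deleted token-overlap F1 and slot-filling metrics.
    from collections import Counter
    import numpy as np

    def token_overlap_f1(generated: str, ground_truth: str):
        """Precision/recall/F1 from whitespace-token overlap between two strings."""
        gen_tokens, gt_tokens = generated.split(), ground_truth.split()
        num_same = sum((Counter(gen_tokens) & Counter(gt_tokens)).values())
        if num_same == 0:
            return 0.0, 0.0, 0.0
        precision = num_same / len(gen_tokens)
        recall = num_same / len(gt_tokens)
        return precision, recall, 2 * precision * recall / (precision + recall)

    def slot_filling_metrics(generated_slots, ground_truth_slots):
        """Average precision/recall/F1 and joint goal accuracy over deduplicated slot strings."""
        precisions, recalls, jgas = [], [], []
        for gen, gt in zip(generated_slots, ground_truth_slots):
            gen, gt = sorted(set(gen)), sorted(set(gt))
            correct = [s for s in gen if s in gt]
            recalls.append(len(correct) / len(gt) if gt else 0)
            precisions.append(len(correct) / len(gen) if gen else 0)
            jgas.append(int(gen == gt))
        avg_p, avg_r = 100 * np.mean(precisions), 100 * np.mean(recalls)
        f1 = 2 * avg_p * avg_r / (avg_p + avg_r + 1e-20)
        return avg_p, avg_r, f1, 100 * np.mean(jgas)

    # Example (made-up slots): one correct slot predicted plus one spurious slot
    # slot_filling_metrics([["time(7pm)", "people(3)"]], [["time(7pm)"]])
    # -> precision 50.0, recall 100.0, f1 ~66.7, joint goal accuracy 0.0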
- -import collections -import copy -import os -import random -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader -from transformers import AutoModelWithLMHead - -from nemo.collections.nlp.data.dialogue import DialogueGPTClassificationDataset, DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( - MegatronGPTPromptLearningModel, -) -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import VirtualPromptSource, VirtualPromptStyle -from nemo.collections.nlp.modules.common.text_generation_utils import ( - get_default_sampling_params, - megatron_gpt_generate, -) -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueGPTClassificationModel'] - - -class DialogueGPTClassificationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueGPTClassificationModel") - - self.cfg = cfg - self.eval_mode = cfg.dataset.eval_mode - self.data_prepared = False - self.epoch_number = 0 - self.prompt_learning = self.cfg.prompt_learning - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - self.unreduced_loss_fct = torch.nn.CrossEntropyLoss(reduction='none') - elif self.cfg.library == "megatron": - if self.prompt_learning: - if os.path.exists(cfg.prompt_learning_nemo_path): - self.language_model = MegatronGPTPromptLearningModel.restore_from( - cfg.prompt_learning_nemo_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - ) - else: - # removing tokenizer cfg as this triggers tokenizer construction which is not helpful here as we have a separate tokenizer - new_cfg = copy.copy(cfg) - del new_cfg.tokenizer - new_cfg.nemo_path = cfg.prompt_learning_nemo_path - self.language_model = MegatronGPTPromptLearningModel(new_cfg, trainer) - else: - self.language_model = MegatronGPTModel.restore_from(cfg.language_model.lm_checkpoint, trainer=trainer) - - all_labels = list( - self._train_dl.dataset.all_possible_labels.union( - self._validation_dl.dataset.all_possible_labels, self._test_dl.dataset.all_possible_labels - ) - ) - self.label_to_ids = collections.defaultdict(int) - - for i in range(len(all_labels)): - self.label_to_ids[all_labels[i]] = i - - self.all_existing_labels = set(self.label_to_ids.keys()) - - self.token_to_words = {} - self.classification_report = 
ClassificationReport( - num_classes=len(self.label_to_ids) + 1, mode='micro', label_ids=self.label_to_ids, dist_sync_on_step=True - ) - - def setup_optimizer_param_groups(self): - """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. - Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning - rate for the frozen model's params will always be zero effectively - freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. - """ - if not self.prompt_learning: - super().setup_optimizer_param_groups() - return - # Freeze frozen model - for param in self.language_model.frozen_model.parameters(): - param.requires_grad = False - - virtual_prompt_params = {'params': []} - - if self.language_model.frozen_model.model.pre_process: - virtual_prompt_params['params'].extend([param for param in self.language_model.prompt_table.parameters()]) - - if self.language_model.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: - virtual_prompt_params['params'].extend( - [param for param in self.language_model.prompt_encoder.parameters()] - ) - self._optimizer_param_groups = (virtual_prompt_params,) - - def training_step(self, batch, batch_idx): - ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - template_length, - utterance_length, - correct_candidate, - ) = batch - # construct training samples as generating " Answer: yes/no" after " : " - if self.eval_mode == "binary_score": - new_input_ids = [] - new_attn_masks = [] - for i in range(candidate_input_ids.size(0)): - # in some datasets like assistant, there might be 60+ possible intents with 1 correct intent - # therefore we might not want to use all possible intents as negative samples - # instead use {binary_score_subsample_ratio} negative samples for every positive sample - if self.cfg.dataset.binary_score_subsample: - new_input_ids.append(candidate_input_ids[i, 2 * correct_candidate[i].item(), :]) - new_attn_masks.append(candidate_attn_masks[i, 2 * correct_candidate[i].item(), :]) - possible_negatives = [] - for j in range(0, candidate_input_ids.size(1), 2): - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - if j != 2 * correct_candidate[i].item(): - possible_negatives.append(j) - negative_samples = random.choices( - possible_negatives, k=int(self.cfg.dataset.binary_score_subsample_ratio) - ) - for negative_sample in negative_samples: - new_input_ids.append(candidate_input_ids[i, negative_sample, :]) - new_attn_masks.append(candidate_attn_masks[i, negative_sample, :]) - - else: - for j in range(0, candidate_input_ids.size(1), 2): - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - new_input_ids.append(candidate_input_ids[i, j, :]) - new_attn_masks.append(candidate_attn_masks[i, j, :]) - input_ids = torch.stack(new_input_ids) - attn_masks = torch.stack(new_attn_masks) - labels = self.get_binary_score_labels(input_ids) - - loss, _ = self(input_ids, attn_masks, labels, inference=False) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def 
on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - - with_slots = self.cfg.dataset.target_template == "with_slots" - - generated_labels, generated_slots = DialogueClassificationMetrics.split_label_and_slots( - generated_field, with_slots=with_slots - ) - ground_truth_labels, ground_truth_slots = DialogueClassificationMetrics.split_label_and_slots( - ground_truth_field, with_slots=with_slots - ) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueClassificationMetrics.save_predictions( - filename, - generated_labels, - generated_slots, - ground_truth_labels, - ground_truth_slots, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_labels[i] == ground_truth_labels[i]) for i in range(len(generated_labels))]) - - generated_field_ids = torch.tensor([self.label_to_ids[label] for label in generated_labels], dtype=int).to( - self.classification_report.device - ) - - ground_truth_field_ids = torch.tensor( - [self.label_to_ids[label] for label in ground_truth_labels], dtype=int - ).to(self.classification_report.device) - - tp, fn, fp, _ = self.classification_report(generated_field_ids, ground_truth_field_ids) - - precision, recall, f1, report = self.classification_report.compute() - self.classification_report.reset() - - ( - slot_precision, - slot_recall, - slot_f1, - slot_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - logging.info(report) - - self.log('{}_precision'.format(self.cfg.dataset.field), precision) - self.log('{}_f1'.format(self.cfg.dataset.field), f1) - self.log('{}_recall'.format(self.cfg.dataset.field), recall) - self.log('{}_{}_accuracy'.format(mode, self.cfg.dataset.field), label_acc * 100) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - self.log('slot_joint_goal_accuracy', slot_joint_goal_accuracy) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/epoch-{}-model.bin'.format(self.cfg.dataset.dialogues_example_dir, self.epoch_number) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def on_train_end(self): - if self.prompt_learning: - self.language_model.on_train_end() - - def get_prompt_token_labels_for_megatron_gpt(self, input_ids, num_prompt_tokens): - - prompt_token_labels = torch.full( - size=(input_ids.size(0), num_prompt_tokens), - fill_value=self.tokenizer.tokenizer.pad_token_id, - dtype=torch.long, - ) - - if self.prompt_learning: - prompt_token_labels.data = torch.LongTensor( - 
np.tile(np.array(self.language_model.pseudo_token_ids), (input_ids.size(0), 1)) - ) - - prompt_token_labels = prompt_token_labels.to(input_ids.device) - - return prompt_token_labels - - def get_virtual_prompt_ids_for_megatron_gpt(self, input_ids): - if ( - self.cfg.virtual_prompt_style == VirtualPromptStyle.P_TUNING - or not self.prompt_learning - or self.trainer.testing - ): - prompt_ids = torch.tensor([0] * input_ids.size(0)).to(input_ids.device) if self.prompt_learning else None - else: - total_virtual_tokens = self.cfg.task_templates[0].total_virtual_tokens - init_text = self.cfg.task_templates[0].taskname - init_text_ids = self.tokenizer.text_to_ids(init_text) - init_text_ids = torch.tensor(init_text_ids).to(input_ids.device) - prompt_ids = init_text_ids.repeat(input_ids.size(0), 1)[:, :total_virtual_tokens] - return prompt_ids - - def forward(self, input_ids, attention_mask, labels, inference=True): - - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) - loss = output['loss'] - # calculate loss per sample - b_logits = output['logits'] - shift_logits = b_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - unreduced_loss = self.unreduced_loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - loss_per_sample = torch.mean(unreduced_loss.view(shift_labels.size()), dim=-1) - elif self.cfg.library == "megatron": - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - position_ids = torch.arange( - start=0, - end=num_prompt_tokens + input_ids.size(1), - dtype=torch.long, - device=input_ids.device, - ) - - prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) - - attn_mask_add_on = torch.ones((attention_mask.size(0), num_prompt_tokens), device=attention_mask.device) - full_attention_mask = torch.cat([attn_mask_add_on, attention_mask], axis=-1) - full_attention_mask_expand = torch.tril( - full_attention_mask.unsqueeze(2).tile(full_attention_mask.size(1)) - ).unsqueeze(1) - - attn_mask = full_attention_mask_expand <= 0 - - prompt_token_labels = self.get_prompt_token_labels_for_megatron_gpt(input_ids, num_prompt_tokens) - - input_ids_new = torch.cat([prompt_token_labels, input_ids], axis=1) - make_up_last_column_input_ids = ( - torch.ones_like(input_ids_new[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - ) - left_shifted_input_ids = torch.cat([input_ids_new[:, 1:], make_up_last_column_input_ids], axis=-1) - if self.prompt_learning: - unmasked_unreduced_loss = self.language_model( - input_ids_new, - position_ids, - attn_mask, - labels=left_shifted_input_ids, - taskname_ids=prompt_ids, - inference=inference, - ) - else: - unmasked_unreduced_loss = self.language_model( - input_ids, position_ids, attn_mask, labels=left_shifted_input_ids - ) - - if isinstance(unmasked_unreduced_loss, tuple): - unmasked_unreduced_loss = unmasked_unreduced_loss[0] - - labels = torch.cat([prompt_token_labels, labels], axis=1) - make_up_last_column_labels = torch.ones_like(labels[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - new_labels = torch.cat([labels[:, 1:], make_up_last_column_labels], axis=-1) - filler = torch.zeros_like(new_labels) - labels_mask_0 = torch.where(new_labels != -100, new_labels, filler) - labels_mask = labels_mask_0 > 0 - - loss = self.mask_and_reduce_loss(labels_mask, unmasked_unreduced_loss) - loss_per_sample = 
self.mask_and_reduce_loss_per_sample(labels_mask, unmasked_unreduced_loss) - - return loss, loss_per_sample - - def mask_and_reduce_loss_per_sample(self, loss_mask, unmasked_unreduced_loss): - """ - Mask and reduce loss based on each sample in batch - Useful for ranking candidates with the same prompt in batch based on loss - """ - losses = unmasked_unreduced_loss.float() - loss_mask = loss_mask.view(-1).float() - masked_loss = losses.view(-1) * loss_mask - loss_per_sample = torch.mean(masked_loss.view(unmasked_unreduced_loss.size()), dim=-1) - return loss_per_sample - - def mask_and_reduce_loss(self, loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss - - def decode(self, tokens): - if tokens not in self.token_to_words: - self.token_to_words[tokens] = self.tokenizer.tokenizer.decode(tokens) - return self.token_to_words[tokens] - - def binary_score_candidates( - self, - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - correct_candidate, - minus_negative=True, - inference=False, - ): - best_candidate_input_ids = [] - - for i in range(candidate_input_ids.size(0)): - best_j = 0 - - lowest_loss = float("inf") - - for j in range(0, candidate_input_ids.size(1), 2): - - if j > 0 and torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - break - - start_yes = j if j // 2 == correct_candidate[i].item() else j + 1 - - cand_loss = self( - candidate_input_ids[i, start_yes : start_yes + 1, :], - candidate_attn_masks[i, start_yes : start_yes + 1, :], - self.get_binary_score_labels(candidate_input_ids[i, start_yes : start_yes + 1, :]), - inference=inference, - ) - - considered_loss = cand_loss.item() - - if minus_negative: - start_no = j + 1 if j // 2 == correct_candidate[i].item() else j - - negative_cand_loss = self( - candidate_input_ids[i, start_no : start_no + 1, :], - candidate_attn_masks[i, start_no : start_no + 1, :], - self.get_binary_score_labels(candidate_input_ids[i, start_no : start_no + 1, :]), - inference=inference, - ) - considered_loss -= negative_cand_loss.item() - - if considered_loss < lowest_loss: - best_j = start_yes - lowest_loss = considered_loss - - best_candidate_input_ids.append(candidate_input_ids[i, best_j, :]) - - candidate_tokens = torch.stack(best_candidate_input_ids) - generated_field, ground_truth_field = self.process_into_structured_fields( - candidate_tokens, labels, template_length=template_length - ) - return generated_field, ground_truth_field - - def get_binary_score_labels(self, input_ids): - # mask out every token except the last token for yes/no/true/false - labels = torch.zeros_like(input_ids) - - for i in range(input_ids.size(0)): - for j in range(input_ids.size(1)): - if input_ids.data[0, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - last_point = stop_point - 1 - labels.data[i, last_point] = input_ids[i, last_point] - - return labels - - def rank_candidates( - self, - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - minus_prior=True, - inference=False, - ): - best_candidate_input_ids = [] - - for i in range(candidate_input_ids.size(0)): - # candidates are padded with first candidate to ensure equal number of candidates in batch - # run for loop to strip redundant candidates - last_j = candidate_input_ids.size(1) - for j in range(1, candidate_input_ids.size(1)): - if 
torch.equal(candidate_input_ids[i, j, :], candidate_input_ids[i, 0, :]): - last_j = j - break - - utterance_end = utterance_length[i].item() - # this might cause GPU memory pressure there are many candidates - # if OOM, re-write to do this in a for loop with as many as train_ds.batch_size - _, loss_per_sample = self( - candidate_input_ids[i, :last_j, :], - candidate_attn_masks[i, :last_j, :], - candidate_input_ids[i, :last_j, :], - inference=inference, - ) - - if minus_prior: - _, utterance_free_cand_loss_per_sample = self( - candidate_input_ids[i, :last_j, utterance_end:], - candidate_attn_masks[i, :last_j, utterance_end:], - candidate_input_ids[i, :last_j, utterance_end:], - inference=inference, - ) - considered_loss = loss_per_sample - utterance_free_cand_loss_per_sample - else: - considered_loss = loss_per_sample - best_j = torch.argmin(considered_loss) - best_candidate_input_ids.append(candidate_input_ids[i, best_j, :]) - - candidate_tokens = torch.stack(best_candidate_input_ids) - generated_field, ground_truth_field = self.process_into_structured_fields( - candidate_tokens, labels, template_length=template_length - ) - return generated_field, ground_truth_field - - def generate_candidates(self, labels, template_length, input_ids, attn_masks): - - tokens_to_generate = self.cfg.tokens_to_generate - - if self.cfg.library == "huggingface": - generated_tokens = [] - max_length = 0 - for i in range(input_ids.size(0)): - param_dict = { - "input_ids": input_ids[i : i + 1, : template_length[i]], - "max_length": template_length[i] + tokens_to_generate, - "pad_token_id": self.tokenizer.tokenizer.pad_token_id, - } - generated_tokens.append(self.language_model.generate(**param_dict)) - max_length = max(max_length, generated_tokens[-1].size(1)) - - # pad each generated to ensure they are of same length in dim 1, therefore stack-able - generated_tokens = [ - torch.cat( - [i, torch.ones((1, max_length - i.size(1))).to(i.device) * self.tokenizer.tokenizer.pad_token_id], - axis=-1, - ) - for i in generated_tokens - ] - generated_tokens = torch.cat(generated_tokens, axis=0) - num_prompt_tokens = 0 - - elif self.cfg.library == "megatron": - - prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) - - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - - prompt_token_labels = self.get_prompt_token_labels_for_megatron_gpt(input_ids, num_prompt_tokens) - input_ids_without_answers = [ - torch.cat( - [ - input_ids[i, : template_length[i]], - torch.ones((input_ids.size(1) - template_length[i].item(),)).to(input_ids.device) - * self.tokenizer.tokenizer.pad_token_id, - ], - axis=-1, - ).type(input_ids.dtype) - for i in range(input_ids.size(0)) - ] - input_ids_without_answers = torch.stack(input_ids_without_answers) - input_ids_new = torch.cat( - [ - prompt_token_labels, - input_ids_without_answers, - torch.ones((input_ids.size(0), tokens_to_generate)).to(input_ids.device) - * self.tokenizer.tokenizer.pad_token_id, - ], - axis=1, - ).type(input_ids.dtype) - - tokens_for_generation = (input_ids_new, template_length + num_prompt_tokens) - - length_param: LengthParam = {"min_length": 0, "max_length": tokens_to_generate} - - generated_dict = megatron_gpt_generate( - self.language_model, - tokens_for_generation, - self.tokenizer, - length_param, - get_default_sampling_params(), - task_ids=prompt_ids, - ) - generated_tokens = torch.LongTensor(generated_dict['token_ids']) - - generated_field, ground_truth_field = 
self.process_into_structured_fields( - generated_tokens, labels, template_length=template_length + num_prompt_tokens - ) - return generated_field, ground_truth_field - - def eval_step_helper(self, batch, mode='val'): - ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - template_length, - utterance_length, - correct_candidate, - ) = batch - - inference = mode == 'test' - loss, _ = self(input_ids, attn_masks, labels, inference=inference) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - # ranking using perplexity of candidates following the " :" - if self.eval_mode == "ranking": - generated_field, ground_truth_field = self.rank_candidates( - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - inference=inference, - ) - # autoregressively generate candidates (possibly with constraint) - elif self.eval_mode == "generation": - generated_field, ground_truth_field = self.generate_candidates( - labels, template_length, input_ids, attn_masks - ) - # comparing likelihood based on the perplexity of generating " Answer: yes" after " : " - # (optionally, the difference of that with " Answer: no" using the flag minus_negative=True) - elif self.eval_mode == "binary_score": - generated_field, ground_truth_field = self.binary_score_candidates( - candidate_input_ids, - candidate_attn_masks, - utterance_length, - labels, - template_length, - correct_candidate, - inference=inference, - ) - - else: - raise ValueError( - "{} is not among supported options (ranking, generation, binary_score)".format(self.eval_mode) - ) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def process_into_structured_fields(self, generated_tokens, labels, template_length=None): - - generated_field = [] - - for i in range(generated_tokens.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = generated_tokens.size(1) - - for j in range(start_point, stop_point): - if generated_tokens.data[i, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - - # this is to account for the tokens ' Answer: ' + 'yes'/'no'/'true'/'false' - if self.eval_mode == "binary_score": - stop_point -= 3 - - one_generated_field = self.decode(generated_tokens[i, start_point:stop_point]).strip() - generated_field.append(one_generated_field) - - ground_truth_field = self.process_ground_truth_field(labels) - - return generated_field, ground_truth_field - - def process_ground_truth_field(self, labels): - ground_truth_field = [] - - for i in range(labels.size(0)): - correct_label = tuple( - [j for j in labels.data[i] if j != self.tokenizer.tokenizer.pad_token_id and j != -100] - ) - ground_truth_field.append(self.decode(correct_label).strip()) - - return ground_truth_field - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == 'sgd': - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - elif self._cfg.dataset.task in ['assistant', "zero_shot"]: - self.dialogues_processor = DialogueAssistantDataProcessor( - data_dir=self._cfg.dataset.data_dir, 
tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'design': - self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") - - self.data_prepared = True - - def setup(self, stage=None): - super().setup(stage) - if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit": - if self.cfg.virtual_prompt_style == VirtualPromptStyle.P_TUNING: - self.language_model.init_prompt_encoder() - else: - raise ValueError( - "Use model.virtual_prompt_style='p-tuning' with model.p_tuning.encoder_type='embedding' to enable prompt-tuning." - ) - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueGPTClassificationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py deleted file mode 100644 index 7fb0ba770189..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader -from transformers import AutoModelWithLMHead - -from nemo.collections.nlp.data.dialogue.data_processor.mellon_qa_data_processor import DialogueMellonQADataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.ms_marco_data_processor import DialogueMSMarcoDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_generation_dataset import DialogueGPTGenerationDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( - MegatronGPTPromptLearningModel, -) -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueGPTGenerationModel'] - -NUM_TASKS = 1 # focussing on intent currently 6 # number of multi-head tasks - - -class DialogueGPTGenerationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueGPTGenerationModel") - - self.cfg = cfg - self.data_prepared = False - - self.setup_tokenizer(cfg.tokenizer) - self.tokenizer.tokenizer.pad_token = self.tokenizer.tokenizer.eos_token - self.epoch_number = 0 - self.prompt_learning = self.cfg.prompt_learning - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - if self.cfg.language_model.lm_checkpoint: - self.language_model.load_state_dict(torch.load(self.cfg.language_model.lm_checkpoint)) - elif self.cfg.library == "megatron": - if self.prompt_learning: - # removing tokenizer cfg as this triggers tokenizer construction which is not helpful here as we have a separate tokenizer - new_cfg = copy.copy(cfg) - del new_cfg.tokenizer - self.language_model = MegatronGPTPromptLearningModel(new_cfg, trainer) - else: - self.language_model = MegatronGPTModel.restore_from(cfg.language_model.lm_checkpoint, 
trainer=trainer) - - def training_step(self, batch, batch_idx): - input_ids, attn_masks, labels, _, _ = batch - - loss = self(input_ids, attn_masks, labels, inference=False) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - loss = [] - - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - loss.append(output["loss"].item()) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueGenerationMetrics.save_predictions( - filename, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) - precision, recall, f1 = DialogueGenerationMetrics.get_f1(generated_field, ground_truth_field) - bleu = DialogueGenerationMetrics.get_bleu(generated_field, ground_truth_field) - avg_loss = np.mean(loss) - ppl = np.exp(avg_loss) - - self.log('{}_accuracy'.format(mode), label_acc * 100) - self.log('precision', precision) - self.log('recall', recall) - self.log('f1', f1) - self.log('bleu', bleu) - self.log('{}_loss'.format(mode), avg_loss) - self.log('{}_ppl'.format(mode), ppl) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/val_loss-{}-epoch-{}-answer-extender.bin'.format( - self.cfg.dataset.dialogues_example_dir, avg_loss, self.epoch_number - ) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def forward(self, input_ids, attention_mask, labels, inference=True): - - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) - loss = output['loss'] - - elif self.cfg.library == "megatron": - num_prompt_tokens = ( - len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 - ) - - position_ids = torch.arange( - start=0, - end=num_prompt_tokens + input_ids.size(1), - dtype=torch.long, - device=input_ids.device, - ) - - position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) - - prompt_ids = torch.tensor([0] * input_ids.size(0)) if self.prompt_learning else None - - attn_mask_add_on = torch.ones((attention_mask.size(0), num_prompt_tokens), device=attention_mask.device) - full_attention_mask = torch.cat([attn_mask_add_on, attention_mask], axis=-1) - full_attention_mask_expand = torch.tril( - full_attention_mask.unsqueeze(2).tile(full_attention_mask.size(1)) - ).unsqueeze(1) 
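The megatron branch of the removed forward() builds its own causal attention mask from the padding mask and masks the loss over left-shifted labels. That pattern is easier to follow on toy tensors; the sketch below mirrors the tril-based mask expansion and masked loss reduction shown here, with batch size, sequence length, token ids, and pad id chosen purely for illustration.

    # Illustrative restatement of the mask construction and loss reduction pattern.
    import torch

    pad_id = 0
    attention_mask = torch.tensor([[1, 1, 1, 0]])  # (batch, seq): 1 = real token, 0 = pad
    # expand the padding mask into a per-position lower-triangular (causal) mask
    full = torch.tril(attention_mask.unsqueeze(2).tile(attention_mask.size(1))).unsqueeze(1)
    attn_mask = full > 0  # boolean (batch, 1, seq, seq) mask, True where attention is kept

    # labels are the inputs shifted left by one (next-token prediction), padded at the end
    input_ids = torch.tensor([[11, 12, 13, pad_id]])
    labels = torch.cat([input_ids[:, 1:], torch.full_like(input_ids[:, -1:], pad_id)], dim=-1)
    # with pad_id = 0 in this toy example, the > 0 test drops pad and ignore-index positions
    loss_mask = (torch.where(labels != -100, labels, torch.zeros_like(labels)) > 0).float()

    # per-token losses would come from the language model; random values stand in here
    unreduced_loss = torch.rand_like(loss_mask)
    loss = torch.sum(unreduced_loss.view(-1) * loss_mask.view(-1)) / loss_mask.sum()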
- - attn_mask = full_attention_mask_expand > 0 - - prompt_token_labels = torch.full( - size=(input_ids.size(0), num_prompt_tokens), - fill_value=self.tokenizer.tokenizer.pad_token_id, - dtype=torch.long, - ) - - if self.prompt_learning: - prompt_token_labels.data = torch.LongTensor( - np.tile(np.array(self.language_model.pseudo_token_ids), (input_ids.size(0), 1)) - ) - - prompt_token_labels = prompt_token_labels.to(input_ids.device) - - input_ids_new = torch.cat([torch.zeros_like(prompt_token_labels), input_ids], axis=1) - make_up_last_column_input_ids = ( - torch.ones_like(input_ids_new[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - ) - left_shifted_input_ids = torch.cat([input_ids_new[:, 1:], make_up_last_column_input_ids], axis=-1) - if self.prompt_learning: - unmasked_unreduced_loss = self.language_model( - input_ids_new, - position_ids, - attn_mask, - labels=left_shifted_input_ids, - taskname_ids=prompt_ids, - inference=inference, - ) - else: - unmasked_unreduced_loss = self.language_model( - input_ids, position_ids, attn_mask, labels=left_shifted_input_ids - ) - - if isinstance(unmasked_unreduced_loss, tuple): - unmasked_unreduced_loss = unmasked_unreduced_loss[0] - - labels = torch.cat([prompt_token_labels, labels], axis=1) - make_up_last_column_labels = torch.ones_like(labels[:, -1:]) * self.tokenizer.tokenizer.pad_token_id - new_labels = torch.cat([labels[:, 1:], make_up_last_column_labels], axis=-1) - filler = torch.zeros_like(new_labels) - labels_mask_0 = torch.where(new_labels != -100, new_labels, filler) - labels_mask = labels_mask_0 > 0 - - loss = self.mask_and_reduce_loss(labels_mask, unmasked_unreduced_loss) - return loss - - def mask_and_reduce_loss(self, loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss - - def setup(self, stage=None): - super().setup(stage) - if self.cfg.library == "megatron" and self.prompt_learning: - self.language_model.init_new_prompts() - - def prepare_megatron_generation(self, labels, input_ids, template_length): - """ - # adapted from MegatronGPTModel._bucketize_gpt_inference - """ - batch_size = labels.size(0) - prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None - batch_tokens = input_ids.tolist() - - # unpad tokens - lens = template_length - indxs = [index for index in range(batch_size)] - for lenn, index in zip(lens, indxs): - batch_tokens[index] = batch_tokens[index][:lenn] - - # chunk tokens by same length - pre_buckets, lens = [], list(set(lens.tolist())) - for lenn in lens: - pre_buckets.append([(tokens, index) for index, tokens in enumerate(batch_tokens) if len(tokens) == lenn]) - - buckets, positions, bucket_prompt_tags = [], [], [] - - # get buckets and prompts initial positions - for bucket in pre_buckets: - buckets.append(torch.tensor([item[0] for item in bucket]).to(device=labels.device)) - positions.append([item[1] for item in bucket]) - - # bucket prompt tags identically to their corresponding examples - if prompt_tags: - bucket_prompt_tags.append([prompt_tags[item[1]] for item in bucket]) - - # Flatten position list - positions = [item for sublist in positions for item in sublist] - - # Flatten buckets and bucket_prompt_tags # temp fix for megatron complete issue. 
However, this is also slower than bucketized inference - buckets = [item.unsqueeze(0) for sublist in buckets for item in sublist] - bucket_prompt_tags = [[item] for sublist in bucket_prompt_tags for item in sublist] - - request = {"tokens": buckets, "prompt_tags": bucket_prompt_tags} - - return positions, request - - def post_process_megatron_generation(self, outputs): - text_outputs = [output[0] for output in outputs] - generated_tokens = self.tokenizer.tokenizer(text_outputs, padding=True, return_tensors="pt").data["input_ids"] - return generated_tokens - - def generate_candidates(self, labels, template_length, input_ids, attn_masks): - - tokens_to_generate = self.cfg.tokens_to_generate - if self.cfg.library == "huggingface": - generated_tokens = [] - max_length = 0 - for i in range(input_ids.size(0)): - param_dict = { - "input_ids": input_ids[i : i + 1, : template_length[i]], - "max_length": template_length[i] + tokens_to_generate, - "pad_token_id": self.tokenizer.tokenizer.pad_token_id, - } - generated_tokens.append(self.language_model.generate(**param_dict)) - max_length = max(max_length, generated_tokens[-1].size(1)) - - # pad each generated to ensure they are of same length in dim 1, therefore stack-able - generated_tokens = [ - torch.cat( - [i, torch.ones((1, max_length - i.size(1))).to(i.device) * self.tokenizer.tokenizer.pad_token_id], - axis=-1, - ) - for i in generated_tokens - ] - generated_tokens = torch.cat(generated_tokens, axis=0) - - elif self.cfg.library == "megatron": - positions, request = self.prepare_megatron_generation(labels, input_ids, template_length) - outputs = self.language_model.complete(request, positions, tokens_to_generate) - generated_tokens = self.post_process_megatron_generation(outputs) - - generated_field = self.process_into_structured_fields(generated_tokens, template_length=template_length) - - ground_truth_field = self.process_into_structured_fields(labels, template_length=template_length) - - return generated_field, ground_truth_field - - def process_into_structured_fields(self, full_seq_ids, template_length=None): - - structured_field = [] - for i in range(full_seq_ids.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = full_seq_ids.size(1) - - for j in range(start_point, stop_point): - if full_seq_ids.data[i, j] == self.tokenizer.tokenizer.pad_token_id: - stop_point = j - break - one_generated_field = self.tokenizer.tokenizer.decode(full_seq_ids[i, start_point:stop_point]).strip() - structured_field.append(one_generated_field) - return structured_field - - def eval_step_helper(self, batch, mode='val'): - - input_ids, attn_masks, labels, template_length, utterance_length = batch - - loss = self(input_ids, attn_masks, labels) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - # autoregressively generate candidates (possibly with constraint) - generated_field, ground_truth_field = self.generate_candidates(labels, template_length, input_ids, attn_masks) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == "ms_marco": - self.dialogues_processor = DialogueMSMarcoDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, 
cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == "mellon_qa": - self.dialogues_processor = DialogueMellonQADataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - else: - raise ValueError("Only ms_marco and mellon_qa supported for Dialogue GPT Generation Model") - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueGPTGenerationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py deleted file mode 100644 index 9bf7ae2a9116..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Optional - -import numpy as np -import torch -import torch.nn.functional as F -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoModel - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_nearest_neighbour_dataset import ( - DialogueNearestNeighbourDataset, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueNearestNeighbourModel'] - - -class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("DialogueNearestNeighbourModel") - - self.cfg = cfg - super().__init__(cfg=cfg, trainer=trainer) - if self.cfg.library == "huggingface": - self.language_model = AutoModel.from_pretrained(self.cfg.language_model.pretrained_model_name) - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split) -> 'torch.utils.data.DataLoader': - if self._cfg.dataset.task == "zero_shot": - self.data_processor = DialogueAssistantDataProcessor( - self.cfg.data_dir, self.tokenizer, cfg=self.cfg.dataset - ) - elif self._cfg.dataset.task == "design": - self.data_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'sgd': - self.data_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise ValueError("Only zero_shot, design and sgd supported for Zero Shot Intent Model") - - dataset = DialogueNearestNeighbourDataset( - dataset_split, - self.data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 0), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - def forward(self, input_ids, attention_mask): - if self.cfg.library == 'huggingface': - output = self.language_model(input_ids=input_ids, attention_mask=attention_mask) - return output - - def 
training_step(self, batch, batch_idx): - raise NotImplementedError - - def test_step(self, batch, batch_idx): - loss = self.validation_step(batch, batch_idx, mode='test') - self.test_step_outputs.append(loss) - return loss - - @staticmethod - def mean_pooling(model_output, attention_mask): - token_embeddings = model_output[0] # First element of model_output contains all token embeddings - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) - - def validation_step(self, batch, batch_idx, mode='val'): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - input_ids, input_mask, labels = batch - preds = [] - gts = [] - inputs = [] - for i in range(input_ids.size(0)): - output = self.forward(input_ids=input_ids[i], attention_mask=input_mask[i]) - sentence_embeddings = DialogueNearestNeighbourModel.mean_pooling(output, input_mask[i]) - sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) - cos_sim = F.cosine_similarity(sentence_embeddings[:1, :], sentence_embeddings[1:, :]) - pred = torch.argmax(cos_sim).item() + 1 - gt = torch.argmax(labels[i][1:]).item() + 1 - - preds.append(input_ids[i, pred]) - gts.append(input_ids[i, gt]) - inputs.append(input_ids[i, 0]) - - loss = {'preds': torch.stack(preds), 'labels': torch.stack(gts), 'inputs': torch.stack(inputs)} - self.validation_step_outputs.append(loss) - return loss - - def multi_test_epoch_end(self, outputs, dataloader_idx): - return self.on_validation_epoch_end() - - def on_validation_epoch_end(self): - """ - Get metrics based on the candidate label with the highest predicted likelihood and the ground truth label for intent - """ - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - output_preds = torch.cat([output['preds'] for output in outputs], dim=0) - output_labels = torch.cat([output['labels'] for output in outputs], dim=0) - inputs = torch.cat([output['inputs'] for output in outputs], dim=0) - - decoded_preds = self.tokenizer.tokenizer.batch_decode(output_preds, skip_special_tokens=True) - decoded_labels = self.tokenizer.tokenizer.batch_decode(output_labels, skip_special_tokens=True) - decoded_inputs = self.tokenizer.tokenizer.batch_decode(inputs, skip_special_tokens=True) - - prompt_len = len(self.cfg.dataset.prompt_template.strip()) - predicted_labels = [i[prompt_len:].strip() for i in decoded_preds] - ground_truth_labels = [i[prompt_len:].strip() for i in decoded_labels] - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") - - DialogueGenerationMetrics.save_predictions( - filename, - predicted_labels, - ground_truth_labels, - decoded_inputs, - ) - - label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} - self.classification_report = ClassificationReport( - num_classes=len(label_to_ids), mode='micro', label_ids=label_to_ids, dist_sync_on_step=True - ).to(output_preds[0].device) - - predicted_label_ids = torch.tensor([label_to_ids[label] for label in predicted_labels]).to( - output_preds[0].device - ) - ground_truth_label_ids = torch.tensor([label_to_ids[label] for label in ground_truth_labels]).to( - output_preds[0].device - ) - - 
tp, fn, fp, _ = self.classification_report(predicted_label_ids, ground_truth_label_ids) - - precision, recall, f1, report = self.classification_report.compute() - label_acc = np.mean([int(predicted_labels[i] == ground_truth_labels[i]) for i in range(len(predicted_labels))]) - - logging.info(report) - - self.log('unified_precision', precision) - self.log('unified_f1', f1) - self.log('unified_recall', recall) - self.log('unified_accuracy', label_acc * 100) - - self.classification_report.reset() - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config: - logging.info( - f"Dataloader config or file_name for the training set is missing, so no data loader for the training set is created!" - ) - self._train_dl = None - return - self._train_dl = self._setup_dataloader_from_config(train_data_config, "train") - - # self.create_loss_module() - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config: - logging.info( - f"Dataloader config or file_path for the validation data set is missing, so no data loader for the validation set is created!" - ) - self._validation_dl = None - return - self._validation_dl = self._setup_dataloader_from_config(val_data_config, "dev") - - def setup_multiple_test_data(self, test_data_config: Optional[DictConfig]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - if not test_data_config: - logging.info( - f"Dataloader config or file_path for the test data set is missing, so no data loader for the test set is created!" - ) - self._test_dl = None - return - self._test_dl = self._setup_dataloader_from_config(test_data_config, "test") - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py deleted file mode 100644 index 3f0d09d7dc66..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
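The nearest-neighbour scoring removed above reduces to masked mean pooling over token embeddings followed by cosine similarity between the query and each candidate. A minimal, self-contained sketch of that idea, assuming a Hugging Face AutoModel sentence encoder; the checkpoint name and function names below are illustrative, not the removed class's API:

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def mean_pool(last_hidden_state, attention_mask):
    # Average token embeddings, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

def rank_candidates(query, candidates, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).eval()
    batch = tokenizer([query] + candidates, padding=True, return_tensors="pt")
    with torch.no_grad():
        out = model(**batch)
    emb = F.normalize(mean_pool(out.last_hidden_state, batch["attention_mask"]), p=2, dim=1)
    scores = F.cosine_similarity(emb[:1], emb[1:])  # one score per candidate
    return candidates[int(scores.argmax())], scores.tolist()

L2-normalising the pooled embeddings first keeps the scores in [-1, 1] regardless of utterance length, which mirrors what the removed validation_step computes per example.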
- -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf, open_dict -from torch.utils.data import DataLoader -from transformers import AutoModelForSeq2SeqLM - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.mellon_qa_data_processor import DialogueMellonQADataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.ms_marco_data_processor import DialogueMSMarcoDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -try: - from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator - -except (ImportError, ModuleNotFoundError): - logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") - from apex.transformer.pipeline_parallel.utils import ( - _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, - ) - -__all__ = ['DialogueS2SGenerationModel'] - - -class DialogueS2SGenerationModel(NLPModel): - def __init__( - self, - cfg: DictConfig, - trainer: Trainer = None, - ): - # deprecation warning - deprecated_warning("DialogueS2SGenerationModel") - - self.cfg = cfg - self.data_prepared = False - self.epoch_number = 0 - if self.cfg.library == "huggingface": - self.setup_tokenizer(cfg.tokenizer) - elif self.cfg.library == "megatron": - # supporting MegatronT5Model in precision = fp16 - t5_cfg = MegatronT5Model.restore_from( - restore_path=cfg.language_model.lm_checkpoint, trainer=trainer, return_config=True - ) - # Override the T5 configuration with the one from the config file. 
- OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.masked_softmax_fusion = False - t5_cfg.precision = 16 - t5_cfg.encoder_arch = 'transformer' - t5_cfg.decoder_arch = 'transformer' - - language_model = MegatronT5Model.restore_from( - restore_path=cfg.language_model.lm_checkpoint, trainer=trainer, override_config_path=t5_cfg - ) - self.tokenizer = language_model.tokenizer - - super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True) - - if self.cfg.library == "huggingface": - self.language_model = AutoModelForSeq2SeqLM.from_pretrained(cfg.language_model.pretrained_model_name) - self.language_model.resize_token_embeddings(len(self.tokenizer.tokenizer)) - if self.cfg.language_model.lm_checkpoint: - self.language_model.load_state_dict(torch.load(self.cfg.language_model.lm_checkpoint)) - elif self.cfg.library == "megatron": - self.language_model = language_model - - def training_step(self, batch, batch_idx): - input_ids, attn_masks, labels = batch - loss = self(input_ids, attn_masks, labels) - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch) - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - self.eval_epoch_end(self.validation_step_outputs, mode='val') - self.validation_step_outputs.clear() # free memory - - def on_test_epoch_end(self): - self.eval_epoch_end(self.test_step_outputs, mode='test') - self.test_step_outputs.clear() # free memory - - def eval_epoch_end(self, outputs, mode='val'): - - generated_field = [] - ground_truth_field = [] - inputs = [] - loss = [] - - for output in outputs: - generated_field += output["generated_field"] - ground_truth_field += output["ground_truth_field"] - inputs += output["input"] - loss.append(output["loss"].item()) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join( - self.cfg.dataset.dialogues_example_dir, f"{mode}_predictions_epoch{self.epoch_number}.jsonl" - ) - - DialogueGenerationMetrics.save_predictions( - filename, - generated_field, - ground_truth_field, - inputs, - ) - - label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) - precision, recall, f1 = DialogueGenerationMetrics.get_f1(generated_field, ground_truth_field) - bleu = DialogueGenerationMetrics.get_bleu(generated_field, ground_truth_field) - avg_loss = np.mean(loss) - ppl = np.exp(avg_loss) - - self.log('{}_accuracy'.format(mode), label_acc * 100) - self.log('precision', precision) - self.log('recall', recall) - self.log('f1', f1) - self.log('bleu', bleu) - self.log('{}_loss'.format(mode), avg_loss) - self.log('{}_ppl'.format(mode), ppl) - - if mode == 'val': - self.epoch_number += 1 - if self.cfg.save_model: - filename = '{}/val_loss-{}-epoch-{}-answer-extender.bin'.format( - self.cfg.dataset.dialogues_example_dir, avg_loss, self.epoch_number - ) - torch.save(self.language_model.state_dict(), filename) - - def test_step(self, batch, batch_idx): - loss = self.eval_step_helper(batch=batch, mode='test') - self.test_step_outputs.append(loss) - return loss - - # for inference only - def predict_step(self, batch, batch_idx, dataloader_idx=None): - # return self(batch) - raise NotImplementedError() - - def forward(self, input_ids, attention_masks, labels): - if self.cfg.library == "huggingface": - output = self.language_model(input_ids=input_ids, 
attention_mask=attention_masks, labels=labels) - loss = output['loss'] - elif self.cfg.library == "megatron": - - labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) - decoder_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) - - unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], attention_masks, decoder_attn_masks[:, :-1], lm_labels=labels[:, 1:] - ) - loss = self.language_model.loss_func(decoder_attn_masks[:, 1:].contiguous(), unmasked_unreduced_loss) - return loss - - def prepare_megatron_generation(self, labels, input_ids, template_length): - """ - # adapted from MegatronGPTModel._bucketize_gpt_inference - """ - batch_size = labels.size(0) - prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None - batch_tokens = input_ids.tolist() - - # unpad tokens - lens = template_length - indxs = [index for index in range(batch_size)] - for lenn, index in zip(lens, indxs): - batch_tokens[index] = batch_tokens[index][:lenn] - - # chunk tokens by same length - pre_buckets, lens = [], list(set(lens.tolist())) - for lenn in lens: - pre_buckets.append([(tokens, index) for index, tokens in enumerate(batch_tokens) if len(tokens) == lenn]) - - buckets, positions, bucket_prompt_tags = [], [], [] - - # get buckets and prompts initial positions - for bucket in pre_buckets: - buckets.append(torch.tensor([item[0] for item in bucket]).to(device=labels.device)) - positions.append([item[1] for item in bucket]) - - # bucket prompt tags identically to their corresponding examples - if prompt_tags: - bucket_prompt_tags.append([prompt_tags[item[1]] for item in bucket]) - - # Flatten position list - positions = [item for sublist in positions for item in sublist] - - # Flatten buckets and bucket_prompt_tags # temp fix for megatron complete issue. However, this is also slower than bucketized inference - buckets = [item.unsqueeze(0) for sublist in buckets for item in sublist] - bucket_prompt_tags = [[item] for sublist in bucket_prompt_tags for item in sublist] - - request = {"tokens": buckets, "prompt_tags": bucket_prompt_tags} - - return positions, request - - def post_process_megatron_generation(self, outputs): - text_outputs = [output[0] for output in outputs] - generated_tokens = self.tokenizer.tokenizer(text_outputs, padding=True, return_tensors="pt").data["input_ids"] - return generated_tokens - - def generate_candidates(self, input_ids, attn_masks, labels): - - tokens_to_generate = self.cfg.tokens_to_generate - if self.cfg.library == "huggingface": - - param_dict = { - "input_ids": input_ids, - "attention_mask": attn_masks, - "max_length": tokens_to_generate, - } - generated_tokens = self.language_model.generate(**param_dict) - - elif self.cfg.library == 'megatron': - reconfigure_num_microbatches_calculator( - rank=0, # This doesn't matter since it is only used for logging - rampup_batch_size=None, - global_batch_size=1, - micro_batch_size=1, # Make sure that there is no "grad acc" while decoding. - data_parallel_size=1, # We check above to make sure that dataparallel size is always 1 at inference. 
- ) - generated_tokens, _ = self.language_model.decode(input_ids, attn_masks, tokens_to_generate) - - generated_field = self.process_into_structured_fields(generated_tokens) - ground_truth_field = self.process_into_structured_fields(labels) - - return generated_field, ground_truth_field - - def process_into_structured_fields(self, full_seq_ids, template_length=None): - - structured_field = [] - for i in range(full_seq_ids.size(0)): - start_point = 0 if template_length is None else template_length[i].item() - stop_point = full_seq_ids.size(1) - - for j in range(start_point, stop_point): - if full_seq_ids.data[i, j] in [self.tokenizer.tokenizer.pad_token_id, -100] and j != 0: - stop_point = j - break - token_ids = full_seq_ids[i, start_point:stop_point] - one_generated_field = self.tokenizer.tokenizer.decode(token_ids, skip_special_tokens=True).strip() - structured_field.append(one_generated_field) - return structured_field - - def eval_step_helper(self, batch, mode='val'): - - input_ids, attn_masks, labels = batch - - loss = self(input_ids, attn_masks, labels) - self.log("{}_loss".format(mode), loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) - - generated_field, ground_truth_field = self.generate_candidates(input_ids, attn_masks, labels) - - return { - 'loss': loss, - 'input': self.tokenizer.tokenizer.batch_decode(input_ids, skip_special_tokens=True), - 'generated_field': generated_field, - 'ground_truth_field': ground_truth_field, - } - - def prepare_data(self): - """ - Preprocesses schemas and dialogues and caches the result - """ - if self.data_prepared: - return - - if self._cfg.dataset.task == "ms_marco": - self.dialogues_processor = DialogueMSMarcoDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == "sgd_generation": - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - elif self._cfg.dataset.task == "mellon_qa": - self.dialogues_processor = DialogueMellonQADataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - else: - raise ValueError("Only ms_marco, sgd_generation and mellon_qa are supported for the Dialogue S2S Generation Model") - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to the preprocessed dialogues example directory; it will be created if it does not exist.
- """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_multiple_validation_data(self, val_data_config: Optional[DictConfig] = None): - return self.setup_validation_data(val_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - self.setup_test_data(test_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - dataset = DialogueS2SGenerationDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py deleted file mode 100644 index 1df19cf8a556..000000000000 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright 2018 The HuggingFace Inc. team. -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from collections import defaultdict -from typing import Dict, List, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from nemo.collections.nlp.data.dialogue import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.design_data_processor import DialogueDesignDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_zero_shot_intent_dataset import DialogueZeroShotIntentDataset -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( - ZeroShotIntentInferenceDataset, - calc_class_weights_from_dataloader, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueGenerationMetrics -from nemo.collections.nlp.models import TextClassificationModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['DialogueZeroShotIntentModel'] - - -class DialogueZeroShotIntentModel(TextClassificationModel): - """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("DialogueZeroShotIntentModel") - - self.cfg = cfg - super().__init__(cfg=cfg, trainer=trainer) - - if self.cfg.library == 'megatron': - # zero shot intent classification loading - # cannot directly load as .nemo uses the pre-refactor model - # therefore transfer its attributes over - if self.cfg.original_nemo_checkpoint is not None: - original_model = DialogueZeroShotIntentModel.restore_from(self.cfg.original_nemo_checkpoint) - self.classifier = original_model.classifier - self.bert_model = original_model.bert_model - self.loss = original_model.loss - self.classification_report = original_model.classification_report - elif self.cfg.library == "huggingface": - self.nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli') - self.bert_model = self.nli_model.model - self.classifier = self.nli_model.classification_head - original_model = DialogueZeroShotIntentModel.restore_from(self.cfg.original_nemo_checkpoint) - self.loss = original_model.loss - self.classification_report = original_model.classification_report - self.tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli') - self.tokenizer.max_seq_length = self.cfg.dataset.max_seq_length - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split) -> 'torch.utils.data.DataLoader': - if self._cfg.dataset.task == "zero_shot": - self.data_processor = DialogueAssistantDataProcessor( - self.cfg.data_dir, self.tokenizer, cfg=self.cfg.dataset - ) - elif self._cfg.dataset.task == "design": - self.data_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset - ) - elif self._cfg.dataset.task == 'sgd': - self.data_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - else: - raise 
ValueError("Only zero_shot, design and sgd supported for Zero Shot Intent Model") - - dataset = DialogueZeroShotIntentDataset( - dataset_split, - self.data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 0), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - def forward(self, input_ids, attention_mask, token_type_ids): - if self.cfg.library == 'megatron': - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - logits = self.classifier(hidden_states=hidden_states) - elif self.cfg.library == 'huggingface': - output = self.nli_model(input_ids=input_ids, attention_mask=attention_mask) - logits = output['logits'] - return logits - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config: - logging.info( - f"Dataloader config or file_name for the training set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._train_dl = self._setup_dataloader_from_config(train_data_config, "train") - - # calculate the class weights to be used in the loss function - if self.cfg.dataset.class_balancing == 'weighted_loss': - self.class_weights = calc_class_weights_from_dataloader( - self._train_dl, self.cfg.dataset.num_classes, self.cfg.dataset.data_dir - ) - else: - self.class_weights = None - # we need to create/update the loss module by using the weights calculated from the training data - self.create_loss_module() - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config: - logging.info( - f"Dataloader config or file_path for the validation data set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._validation_dl = self._setup_dataloader_from_config(val_data_config, "dev") - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - if not test_data_config: - logging.info( - f"Dataloader config or file_path for the test data set is missing, so no data loader for test is created!" - ) - self._test_dl = None - return - self._test_dl = self._setup_dataloader_from_config(test_data_config, "test") - - def _setup_infer_dataloader( - self, - queries: List[str], - candidate_labels: List[str], - hypothesis_template=str, - batch_size=1, - max_seq_length: int = -1, - ) -> 'torch.utils.data.DataLoader': - """ - Setup method for inference data loader. Here the premise-hypothesis pairs are made from queries and candidate labels. - - Args: - queries: the queries to classify - candidate_labels: strings to be used as labels - hypothesis_template: the template used to turn each label into an NLI-style hypothesis. Must include a {} - or similar syntax for the candidate label to be inserted. - batch_size: batch size to use during inference - max_seq_length: maximum length of queries, default is -1 for no limit - Returns: - A pytorch DataLoader. 
- """ - dataset = ZeroShotIntentInferenceDataset( - queries=queries, - candidate_labels=candidate_labels, - tokenizer=self.tokenizer, - max_seq_length=max_seq_length, - hypothesis_template=hypothesis_template, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - shuffle=False, - num_workers=2, - pin_memory=False, - drop_last=False, - collate_fn=dataset.collate_fn, - ) - - def validation_step(self, batch, batch_idx, split='val'): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - input_ids, input_type_ids, input_mask, labels = batch - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - val_loss = self.loss(logits=logits, labels=labels) - - preds = torch.argmax(logits, axis=-1) - - tp, fn, fp, _ = self.classification_report(preds, labels) - - loss = { - 'val_loss': val_loss, - 'tp': tp, - 'fn': fn, - 'fp': fp, - 'logits': logits, - 'input_ids': input_ids, - 'labels': labels, - } - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self, split="val"): - """ - Get metrics based on the candidate label with the highest predicted likelihood and the ground truth label for intent - """ - output_logits = torch.cat([output['logits'] for output in self.validation_step_outputs], dim=0) - output_input_ids = torch.cat([output['input_ids'] for output in self.validation_step_outputs], dim=0) - output_labels = torch.cat([output['labels'] for output in self.validation_step_outputs], dim=0) - - if self.cfg.library == 'huggingface': - entail_logits = output_logits[..., 2] - decoded_input_ids = [self.tokenizer.decode(output_input_ids[i]) for i in range(len(output_input_ids))] - utterance_candidate_pairs = [i.split(self.tokenizer.sep_token) for i in decoded_input_ids] - utterances = [ - i[0].replace(self.tokenizer.bos_token, '').replace(self.tokenizer.eos_token, '') - for i in utterance_candidate_pairs - ] - - elif self.cfg.library == 'megatron': - entail_logits = output_logits[..., 1] - decoded_input_ids = [ - self.tokenizer.tokenizer.decode(output_input_ids[i]) for i in range(len(output_input_ids)) - ] - utterance_candidate_pairs = [i.split(self.tokenizer.tokenizer.sep_token) for i in decoded_input_ids] - utterances = [ - i[0].replace(self.tokenizer.tokenizer.bos_token, '').replace(self.tokenizer.tokenizer.eos_token, '') - for i in utterance_candidate_pairs - ] - - # account for uncased tokenization - candidates = [ - i[1] - .replace(self.cfg.dataset.prompt_template.lower(), '') - .replace(self.cfg.dataset.prompt_template, '') - .strip() - for i in utterance_candidate_pairs - ] - utterance_to_idx = defaultdict(list) - for idx, utterance in enumerate(utterances): - utterance_to_idx[utterance].append(idx) - - predicted_labels = [] - ground_truth_labels = [] - utterances = [] - for utterance, idxs in utterance_to_idx.items(): - utterance_candidates = [candidates[idx] for idx in idxs] - logits = [entail_logits[idx].item() for idx in idxs] - labels = [output_labels[idx].item() for idx in idxs] - correct_candidate = utterance_candidates[np.argmax(labels)] - predicted_candidate = utterance_candidates[np.argmax(logits)] - predicted_labels.append(predicted_candidate) - ground_truth_labels.append(correct_candidate) - utterances.append(utterance) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") - - 
DialogueGenerationMetrics.save_predictions( - filename, - predicted_labels, - ground_truth_labels, - utterances, - ) - - label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} - self.classification_report = ClassificationReport( - num_classes=len(label_to_ids), mode='micro', label_ids=label_to_ids, dist_sync_on_step=True - ).to(output_logits[0].device) - predicted_label_ids = torch.tensor([label_to_ids[label] for label in predicted_labels]).to( - output_logits[0].device - ) - ground_truth_label_ids = torch.tensor([label_to_ids[label] for label in ground_truth_labels]).to( - output_logits[0].device - ) - - tp, fn, fp, _ = self.classification_report(predicted_label_ids, ground_truth_label_ids) - precision, recall, f1, report = self.classification_report.compute() - label_acc = np.mean([int(predicted_labels[i] == ground_truth_labels[i]) for i in range(len(predicted_labels))]) - - avg_loss = torch.stack([x[f'val_loss'] for x in self.validation_step_outputs]).mean() - - logging.info(report) - - self.log('unified_precision', precision) - self.log('unified_f1', f1) - self.log('unified_recall', recall) - self.log('unfied_accuracy', label_acc * 100) - self.log('val_loss', avg_loss, prog_bar=True) - - self.validation_step_outputs.clear() # free memory - self.classification_report.reset() - - def predict( - self, - queries: Union[str, List[str]], - candidate_labels: Union[str, List[str]], - hypothesis_template='This example is {}.', - batch_size=1, - multi_label=True, - entailment_idx=1, - contradiction_idx=0, - ) -> List[Dict]: - """ - Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. - - Example usage: - queries = ["I'd like a veggie burger, fries, and a coke", "Turn off the lights in the living room",] - candidate_labels = ["Food order", "Change lighting"] - model.predict(queries, candidate_labels) - - Example output: - [{'sentence': "I'd like a veggie burger, fries, and a coke", - 'labels': ['Food order', 'Change lighting'], - 'scores': [0.8557153344154358, 0.12036784738302231]}, - {'sentence': 'Turn off the lights in the living room', - 'labels': ['Change lighting', 'Food order'], - 'scores': [0.8506497144699097, 0.06594637036323547]}] - - - Args: - queries: the query or list of queries to classify - candidate_labels: string or list of strings to be used as labels - hypothesis_template: the template used to turn each label into an NLI-style hypothesis. Must include a {} - or similar syntax for the candidate label to be inserted. - batch_size: the batch size to use for inference. - multi_label: whether or not multiple candidate labels can be true. If False, the scores are normalized - such that all class probabilities sum to 1. If True, the labels are - considered independent and probabilities are normalized for each candidate by doing a softmax of - the entailment score vs. the contradiction score. - entailment_idx: the index of the "entailment" class in the trained model; models trained on MNLI - using NeMo's glue_benchmark.py or zero_shot_intent_model.py use an index of 1 by default. - contradiction_idx: the index of the "contradiction" class in the trained model; models trained on MNLI - using NeMo's glue_benchmark.py or zero_shot_intent_model.py use an index of 0 by default. - - Returns: - list of dictionaries; one dict per input query. Each dict has keys "sentence", "labels", "scores". 
- labels and scores are parallel lists (with each score corresponding to the label at the same index), - sorted from highest to lowest score. - - """ - if not queries: - raise ValueError("No queries were passed for classification!") - if not candidate_labels: - raise ValueError("No candidate labels were provided!") - - queries = [queries] if isinstance(queries, str) else queries - candidate_labels = [candidate_labels] if isinstance(candidate_labels, str) else candidate_labels - - if len(candidate_labels) == 1: - multi_label = True - - mode = self.training - try: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Switch model to evaluation mode - self.eval() - self.to(device) - - infer_datalayer = self._setup_infer_dataloader( - queries, - candidate_labels, - hypothesis_template=hypothesis_template, - batch_size=batch_size, - max_seq_length=self._cfg.dataset.max_seq_length, - ) - - all_batch_logits = [] - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, _ = batch - - logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - all_batch_logits.append(logits.detach().cpu().numpy()) - - all_logits = np.concatenate(all_batch_logits) - outputs = all_logits.reshape((len(queries), len(candidate_labels), -1)) - - if not multi_label: - # softmax the "entailment" logits over all candidate labels - entail_logits = outputs[..., entailment_idx] - scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True) - else: - # softmax over the entailment vs. contradiction dim for each label independently - entail_contr_logits = outputs[..., [contradiction_idx, entailment_idx]] - scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True) - scores = scores[..., 1] - - result = [] - for i in range(len(queries)): - sorted_idxs = list(reversed(scores[i].argsort())) - result.append( - { - "sentence": queries[i], - "labels": [candidate_labels[j] for j in sorted_idxs], - "scores": scores[i][sorted_idxs].tolist(), - } - ) - - finally: - # set mode back to its original value - self.train(mode=mode) - return result - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - result = [] - result.append( - PretrainedModelInfo( - pretrained_model_name="zeroshotintent_en_bert_base_uncased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/zeroshotintent_en_bert_base_uncased/versions/1.4.1/files/zeroshotintent_en_bert_base_uncased.nemo", - description="DialogueZeroShotIntentModel trained by fine tuning BERT-base-uncased on the MNLI (Multi-Genre Natural Language Inference) dataset, which achieves an accuracy of 84.9% and 84.8% on the matched and mismatched dev sets, respectively.", - ) - ) - result.append( - PretrainedModelInfo( - pretrained_model_name="zeroshotintent_en_megatron_uncased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/zeroshotintent_en_megatron_uncased/versions/1.4.1/files/zeroshotintent_en_megatron_uncased.nemo", - description="DialogueZeroShotIntentModel trained by fine tuning Megatron-BERT-345m=M-uncased on the MNLI (Multi-Genre Natural Language Inference) dataset, which achieves an accuracy of 90.0% and 89.9% on the matched and mismatched dev sets, respectively", - ) - ) - return result diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py deleted file mode 100644 index 09a81b33c973..000000000000 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict, List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_bert_dataset import ( - DialogueBERTDataset, - DialogueIntentSlotInferenceDataset, -) -from nemo.collections.nlp.data.intent_slot_classification import IntentSlotDataDesc -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes import typecheck -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - - -class IntentSlotClassificationModel(NLPModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes BERT Joint Intent and Slot model.""" - # deprecation warning - deprecated_warning("IntentSlotClassificationModel") - - self.max_seq_length = cfg.dataset.max_seq_length - self.cfg = cfg - # Check the presence of data_dir. - if not cfg.dataset.data_dir or not os.path.exists(cfg.dataset.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.dataset.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.dataset.data_dir, cfg.train_ds, cfg.validation_ds) - # init superclass - super().__init__(cfg=cfg, trainer=trainer) - - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_defaults_data_desc(self, cfg): - """ - Method makes sure that cfg.data_desc params are set. - If not, set's them to "dummy" defaults. - """ - if not hasattr(cfg, "data_desc"): - OmegaConf.set_struct(cfg, False) - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = " " - cfg.data_desc.intent_label_ids = {" ": 0} - cfg.data_desc.intent_weights = [1] - # Slots. - cfg.data_desc.slot_labels = " " - cfg.data_desc.slot_label_ids = {" ": 0} - cfg.data_desc.slot_weights = [1] - - cfg.data_desc.pad_label = "O" - OmegaConf.set_struct(cfg, True) - - def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. 
- cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - {'intent_labels_file': 'intent_labels.csv', 'slot_labels_file': 'slot_labels.csv'} - ) - - slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file) - intent_labels_file = os.path.join(data_dir, cfg.class_labels.intent_labels_file) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact('class_labels.intent_labels_file', intent_labels_file) - self.register_artifact('class_labels.slot_labels_file', slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """Saves label ids map to a file""" - with open(filename, 'w') as out: - labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) - out.write('\n'.join(labels)) - logging.info(f'Labels: {label_ids}') - logging.info(f'Labels mapping saved to : {out.name}') - - def _reconfigure_classifier(self): - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.classifier_head.fc_dropout, - num_layers=self.cfg.classifier_head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == 'weighted_loss': - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = CrossEntropyLoss(logits_ndim=2, weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = CrossEntropyLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight] - ) - - # setup to track metrics - self.intent_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - - def update_data_dir_for_training(self, data_dir: str, train_ds, validation_ds) -> None: - """ - Update data directory and get data stats with Data Descriptor. - Also, reconfigures the classifier - to cope with data with e.g. different number of slots. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - # Update configuration with new data. - self._set_data_desc_to_cfg(self.cfg, data_dir, train_ds, validation_ds) - # Reconfigure the classifier for different settings (number of intents, slots etc.). - self._reconfigure_classifier() - - def update_data_dir_for_testing(self, data_dir) -> None: - """ - Update data directory. 
- - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - if self._cfg.tokenizer.get('library', '') == 'megatron': - hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None) - else: - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - intent_logits, slot_logits = self.classifier(hidden_states=hidden_states) - return intent_logits, slot_logits - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - train_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', train_loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': train_loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- """ - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - # calculate accuracy metrics for intents and slot reporting - # intents - intent_preds = torch.argmax(intent_logits, axis=-1) - self.intent_classification_report.update(intent_preds, intent_labels) - # slots - - subtokens_mask = subtokens_mask > 0.5 - slot_preds = torch.argmax(slot_logits, axis=-1) - self.slot_classification_report.update(slot_preds[subtokens_mask], slot_labels[subtokens_mask]) - - loss = { - 'val_loss': val_loss, - 'intent_tp': self.intent_classification_report.tp, - 'intent_fn': self.intent_classification_report.fn, - 'intent_fp': self.intent_classification_report.fp, - 'slot_tp': self.slot_classification_report.tp, - 'slot_fn': self.slot_classification_report.fn, - 'slot_fp': self.slot_classification_report.fp, - 'intent_preds': intent_preds, - 'intent_labels': intent_labels, - 'slot_preds': slot_preds, - 'slot_labels': slot_labels, - 'input': input_ids, - 'subtokens_mask': subtokens_mask, - } - self.validation_step_outputs.append(loss) - return loss - - @staticmethod - def get_continuous_slots(slot_ids, utterance_tokens): - """ - Extract continuous spans of slot_ids - Args: - Slot_ids: list of str representing slot of each word token - For instance, 'O', 'email_address', 'email_address', 'email_address', 'O', 'O', 'O', 'O'] - Corresponds to ['enter', 'atdfd@yahoo', 'dot', 'com', 'into', 'my', 'contact', 'list'] - Returns: - list of str where each element is a slot name-value pair - e.g. ['email_address(atdfd@yahoo dot com)'] - - """ - slot_id_stack = [] - position_stack = [] - for i, slot_id in enumerate(slot_ids): - if not slot_id_stack or slot_id != slot_id_stack[-1]: - slot_id_stack.append(slot_id) - position_stack.append([]) - position_stack[-1].append(i) - - slot_id_to_start_and_exclusive_end = { - slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1] - for i in range(len(position_stack)) - if slot_id_stack[i] != 'O' - } - - slot_to_words = { - slot: ' '.join(utterance_tokens[position[0] : position[1]]) - for slot, position in slot_id_to_start_and_exclusive_end.items() - } - - slot_name_and_values = ["{}({})".format(slot, value) for slot, value in slot_to_words.items()] - - return slot_name_and_values - - def get_utterance_tokens(self, token_ids, token_masks): - """ - Get utterance tokens based on initial utterance tokenization using token_masks, - which shows the starting subtoken of each utterance token. 
- - Args: - token_ids: IntTensor of size (max_seq_len, ) - token_masks: BoolTensor of size (max_seq_len, ) - - Returns - token_list: List of Str (list of tokens with len <= max_seq_len) - """ - tokens_stack = [] - tokens = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids) - - for token_idx, token in enumerate(tokens): - if token_masks[token_idx].item(): - tokens_stack.append([token]) - elif tokens_stack: - clean_token = ( - token.replace("##", '') - .replace(self.tokenizer.tokenizer.sep_token, '') - .replace(self.tokenizer.tokenizer.pad_token, '') - ) - tokens_stack[-1].append(clean_token) - token_list = [''.join(token) for token in tokens_stack] - return token_list - - def get_unified_metrics(self, outputs): - slot_preds = [] - slot_labels = [] - subtokens_mask = [] - inputs = [] - intent_preds = [] - intent_labels = [] - - for output in outputs: - slot_preds += output['slot_preds'] - slot_labels += output["slot_labels"] - subtokens_mask += output["subtokens_mask"] - inputs += output["input"] - intent_preds += output["intent_preds"] - intent_labels += output["intent_labels"] - - ground_truth_labels = self.convert_intent_ids_to_intent_names(intent_labels) - generated_labels = self.convert_intent_ids_to_intent_names(intent_preds) - - predicted_slots = self.mask_unused_subword_slots(slot_preds, subtokens_mask) - ground_truth_slots = self.mask_unused_subword_slots(slot_labels, subtokens_mask) - - all_generated_slots = [] - all_ground_truth_slots = [] - all_utterances = [] - - for i in range(len(predicted_slots)): - utterance_tokens = self.get_utterance_tokens(inputs[i], subtokens_mask[i]) - ground_truth_slot_names = ground_truth_slots[i].split() - predicted_slot_names = predicted_slots[i].split() - processed_ground_truth_slots = IntentSlotClassificationModel.get_continuous_slots( - ground_truth_slot_names, utterance_tokens - ) - processed_predicted_slots = IntentSlotClassificationModel.get_continuous_slots( - predicted_slot_names, utterance_tokens - ) - - all_generated_slots.append(processed_predicted_slots) - all_ground_truth_slots.append(processed_ground_truth_slots) - all_utterances.append(' '.join(utterance_tokens)) - - os.makedirs(self.cfg.dataset.dialogues_example_dir, exist_ok=True) - filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "predictions.jsonl") - - DialogueClassificationMetrics.save_predictions( - filename, - generated_labels, - all_generated_slots, - ground_truth_labels, - all_ground_truth_slots, - ['' for i in range(len(generated_labels))], - ['' for i in range(len(generated_labels))], - all_utterances, - ) - - ( - slot_precision, - slot_recall, - slot_f1, - slot_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(all_generated_slots, all_ground_truth_slots) - - return slot_precision, slot_recall, slot_f1, slot_joint_goal_accuracy - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. 
- """ - - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - ( - unified_slot_precision, - unified_slot_recall, - unified_slot_f1, - unified_slot_joint_goal_accuracy, - ) = self.get_unified_metrics(outputs) - - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - - # calculate metrics and log classification report (separately for intents and slots) - intent_precision, intent_recall, intent_f1, intent_report = self.intent_classification_report.compute() - logging.info(f'Intent report: {intent_report}') - - slot_precision, slot_recall, slot_f1, slot_report = self.slot_classification_report.compute() - logging.info(f'Slot report: {slot_report}') - - self.log(f'{prefix}_loss', avg_loss) - self.log('intent_precision', intent_precision) - self.log('intent_recall', intent_recall) - self.log('intent_f1', intent_f1) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - self.log('unified_slot_precision', unified_slot_precision) - self.log('unified_slot_recall', unified_slot_recall) - self.log('unified_slot_f1', unified_slot_f1) - self.log('unified_slot_joint_goal_accuracy', unified_slot_joint_goal_accuracy) - - self.intent_classification_report.reset() - self.slot_classification_report.reset() - - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - return { - 'val_loss': avg_loss, - 'intent_precision': intent_precision, - 'intent_recall': intent_recall, - 'intent_f1': intent_f1, - 'slot_precision': slot_precision, - 'slot_recall': slot_recall, - 'slot_f1': slot_f1, - 'unified_slot_precision': unified_slot_precision, - 'unified_slot_recall': unified_slot_recall, - 'unified_slot_f1': unified_slot_f1, - 'unified_slot_joint_goal_accuracy': unified_slot_joint_goal_accuracy, - } - - def test_step(self, batch, batch_idx): - """ - Lightning calls this inside the test loop with the data from the test dataloader - passed in as `batch`. - """ - loss = self.validation_step(batch, batch_idx) - self.test_step_outputs.append(loss) - return loss - - def on_test_epoch_end(self): - """ - Called at the end of test to aggregate outputs. - :param outputs: list of individual outputs of each test step. 
- """ - return self.on_validation_epoch_end() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, dataset_split='train') - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, dataset_split='dev') - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, dataset_split='test') - - def _setup_dataloader_from_config(self, cfg: DictConfig, dataset_split: str): - data_processor = DialogueAssistantDataProcessor(self.data_dir, self.tokenizer, cfg=self.cfg.dataset) - - dataset = DialogueBERTDataset( - dataset_split, - data_processor, - self.tokenizer, - self.cfg.dataset, # this is the model.dataset cfg, which is diff from train_ds cfg etc - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def _setup_infer_dataloader(self, queries: List[str], test_ds) -> 'torch.utils.data.DataLoader': - """ - Setup function for a infer data loader. - Args: - queries: text - batch_size: batch size to use during inference - Returns: - A pytorch DataLoader. - """ - - dataset = DialogueIntentSlotInferenceDataset( - tokenizer=self.tokenizer, queries=queries, max_seq_length=-1, do_lower_case=False - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=test_ds.batch_size, - shuffle=test_ds.shuffle, - num_workers=test_ds.num_workers, - pin_memory=test_ds.pin_memory, - drop_last=test_ds.drop_last, - ) - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self.cfg.dataset.data_dir = data_dir - self.cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def predict_from_examples(self, queries: List[str], test_ds) -> List[List[str]]: - """ - Get prediction for the queries (intent and slots) - Args: - queries: text sequences - test_ds: Dataset configuration section. - Returns: - predicted_intents, predicted_slots: model intent and slot predictions - """ - - predicted_intents = [] - predicted_slots = [] - mode = self.training - - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents - intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1)) - predicted_intents += self.convert_intent_ids_to_intent_names(intent_preds) - - # predict slots - slot_preds = torch.argmax(slot_logits, axis=-1) - predicted_slots += self.mask_unused_subword_slots(slot_preds, subtokens_mask) - - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots - - def convert_intent_ids_to_intent_names(self, intent_preds): - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - - predicted_intents = [] - - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intent_num in intent_preds: - # if intent_num < len(intent_labels): - predicted_intents.append(intent_labels[int(intent_num)]) - # else: - # # should not happen - # predicted_intents.append("Unknown Intent") - return predicted_intents - - def mask_unused_subword_slots(self, slot_preds, subtokens_mask): - # Retrieve intent and slot vocabularies from configuration. - slot_labels = self.cfg.data_desc.slot_labels - predicted_slots = [] - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - query_slots = '' - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - # if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + ' ' - # else: - # query_slots += 'Unknown_slot ' - predicted_slots.append(query_slots.strip()) - return predicted_slots - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - model = PretrainedModelInfo( - pretrained_model_name="Joint_Intent_Slot_Assistant", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemonlpmodels/versions/1.0.0a5/files/Joint_Intent_Slot_Assistant.nemo", - description="This models is trained on this https://github.com/xliuhw/NLU-Evaluation-Data dataset which includes 64 various intents and 55 slots. Final Intent accuracy is about 87%, Slot accuracy is about 89%.", - ) - result.append(model) - return result diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py deleted file mode 100644 index 6cd2243423a4..000000000000 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
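The slot post-processing in the intent/slot model above groups runs of identical slot labels into word spans and renders them as name(value) pairs. A standalone sketch of that grouping, reusing the example from the removed docstring; the helper name is illustrative and, unlike the original dict-based version, it keeps every span rather than only the last one per label:

def continuous_slots(slot_labels, tokens, empty_label="O"):
    # Group runs of identical labels into [start, exclusive_end) spans.
    spans = []
    for i, label in enumerate(slot_labels):
        if spans and spans[-1][0] == label:
            spans[-1][2] = i + 1
        else:
            spans.append([label, i, i + 1])
    return [
        f"{label}({' '.join(tokens[start:end])})"
        for label, start, end in spans
        if label != empty_label
    ]

tokens = ["enter", "atdfd@yahoo", "dot", "com", "into", "my", "contact", "list"]
labels = ["O", "email_address", "email_address", "email_address", "O", "O", "O", "O"]
print(continuous_slots(labels, tokens))   # ['email_address(atdfd@yahoo dot com)']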
- -''' -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/train_and_predict.py -''' - -import os -from typing import List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from torch.utils.data import DataLoader - -from nemo.collections.nlp.data.dialogue import DialogueSGDBERTDataset, DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.sgd.evaluate import evaluate, get_in_domain_services -from nemo.collections.nlp.data.dialogue.sgd.prediction_utils import write_predictions_to_file -from nemo.collections.nlp.losses import SGDDialogueStateLoss -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules import SGDDecoder, SGDEncoder -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['SGDQAModel'] - - -class SGDQAModel(NLPModel): - """ - Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) - - The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. - The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. - The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes - the dialogue state across turns. 
- """ - - @property - def output_module(self): - return self.decoder - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # deprecation warning - deprecated_warning("SGDQAModel") - - self.data_prepared = False - super().__init__(cfg=cfg, trainer=trainer) - self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) - self.decoder = SGDDecoder(embedding_dim=self.bert_model.config.hidden_size) - self.loss = SGDDialogueStateLoss(reduction="mean") - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - token_embeddings = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(token_embeddings, tuple): - token_embeddings = token_embeddings[0] - - encoded_utterance, token_embeddings = self.encoder(hidden_states=token_embeddings) - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self.decoder( - encoded_utterance=encoded_utterance, token_embeddings=token_embeddings, utterance_mask=attention_mask - ) - return ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) - - def training_step(self, batch, batch_idx): - ( - example_id_num, - service_id, - utterance_ids, - token_type_ids, - attention_mask, - intent_status, - requested_slot_status, - categorical_slot_status, - categorical_slot_value_status, - noncategorical_slot_status, - noncategorical_slot_value_start, - noncategorical_slot_value_end, - start_char_idx, - end_char_idx, - task_mask, - ) = batch - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self(input_ids=utterance_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - loss = self.loss( - logit_intent_status=logit_intent_status, - intent_status=intent_status, - logit_req_slot_status=logit_req_slot_status, - requested_slot_status=requested_slot_status, - logit_cat_slot_status=logit_cat_slot_status, - categorical_slot_status=categorical_slot_status, - logit_cat_slot_value_status=logit_cat_slot_value_status, - categorical_slot_value_status=categorical_slot_value_status, - logit_noncat_slot_status=logit_noncat_slot_status, - noncategorical_slot_status=noncategorical_slot_status, - logit_spans=logit_spans, - noncategorical_slot_value_start=noncategorical_slot_value_start, - noncategorical_slot_value_end=noncategorical_slot_value_end, - task_mask=task_mask, - ) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': loss, - 'lr': lr, - } - - def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = 0) -> dict: - """ - Called at every validation step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at validation step - batch_idx: batch index - dataloader_idx: dataloader index - """ - loss, tensors = self.eval_step_helper(batch=batch) - self.log(f'val_loss', loss) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append({f'val_loss': loss, f'tensors': tensors}) - else: - self.validation_step_outputs.append({f'val_loss': loss, f'tensors': tensors}) - - return {f'val_loss': loss, f'tensors': tensors} 
- - def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = 0) -> dict: - """ - Called at every test step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at test step - batch_idx: batch index - dataloader_idx: dataloader index - """ - loss, tensors = self.eval_step_helper(batch=batch) - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append({f'test_loss': loss, f'tensors': tensors}) - else: - self.test_step_outputs.append({f'test_loss': loss, f'tensors': tensors}) - - return {f'test_loss': loss, f'tensors': tensors} - - def eval_step_helper(self, batch: List[torch.Tensor]): - """ - Helper called at every validation/test step to aggregate and postprocess outputs on each GPU - Args: - batch: input batch at step - Returns: - loss: averaged batch loss - tensors: collection of aggregated output tensors across all GPU workers - """ - ( - example_id_num, - service_id, - utterance_ids, - token_type_ids, - attention_mask, - intent_status, - requested_slot_status, - categorical_slot_status, - categorical_slot_value_status, - noncategorical_slot_status, - noncategorical_slot_value_start, - noncategorical_slot_value_end, - start_char_idx, - end_char_idx, - task_mask, - ) = batch - ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) = self(input_ids=utterance_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - loss = self.loss( - logit_intent_status=logit_intent_status, - intent_status=intent_status, - logit_req_slot_status=logit_req_slot_status, - requested_slot_status=requested_slot_status, - logit_cat_slot_status=logit_cat_slot_status, - categorical_slot_status=categorical_slot_status, - logit_cat_slot_value_status=logit_cat_slot_value_status, - categorical_slot_value_status=categorical_slot_value_status, - logit_noncat_slot_status=logit_noncat_slot_status, - noncategorical_slot_status=noncategorical_slot_status, - logit_spans=logit_spans, - noncategorical_slot_value_start=noncategorical_slot_value_start, - noncategorical_slot_value_end=noncategorical_slot_value_end, - task_mask=task_mask, - ) - - all_example_id_num = [] - all_service_id = [] - all_logit_intent_status = [] - all_logit_req_slot_status = [] - all_logit_cat_slot_status = [] - all_logit_cat_slot_value_status = [] - all_logit_noncat_slot_status = [] - all_logit_spans = [] - all_start_char_idx = [] - all_end_char_idx = [] - - if self.trainer.num_devices and self.trainer.world_size > 1: - world_size = self.trainer.world_size - for ind in range(world_size): - all_example_id_num.append(torch.empty_like(example_id_num)) - all_service_id.append(torch.empty_like(service_id)) - all_logit_intent_status.append(torch.empty_like(logit_intent_status)) - all_logit_req_slot_status.append(torch.empty_like(logit_req_slot_status)) - all_logit_cat_slot_status.append(torch.empty_like(logit_cat_slot_status)) - all_logit_cat_slot_value_status.append(torch.empty_like(logit_cat_slot_value_status)) - all_logit_noncat_slot_status.append(torch.empty_like(logit_noncat_slot_status)) - all_logit_spans.append(torch.empty_like(logit_spans)) - all_start_char_idx.append(torch.empty_like(start_char_idx)) - all_end_char_idx.append(torch.empty_like(end_char_idx)) - - torch.distributed.all_gather(all_example_id_num, example_id_num) - torch.distributed.all_gather(all_service_id, service_id) - 
torch.distributed.all_gather(all_logit_intent_status, logit_intent_status) - torch.distributed.all_gather(all_logit_req_slot_status, logit_req_slot_status) - torch.distributed.all_gather(all_logit_cat_slot_status, logit_cat_slot_status) - torch.distributed.all_gather(all_logit_cat_slot_value_status, logit_cat_slot_value_status) - torch.distributed.all_gather(all_logit_noncat_slot_status, logit_noncat_slot_status) - torch.distributed.all_gather(all_logit_spans, logit_spans) - torch.distributed.all_gather(all_start_char_idx, start_char_idx) - torch.distributed.all_gather(all_end_char_idx, end_char_idx) - else: - all_example_id_num.append(example_id_num) - all_service_id.append(service_id) - all_logit_intent_status.append(logit_intent_status) - all_logit_req_slot_status.append(logit_req_slot_status) - all_logit_cat_slot_status.append(logit_cat_slot_status) - all_logit_cat_slot_value_status.append(logit_cat_slot_value_status) - all_logit_noncat_slot_status.append(logit_noncat_slot_status) - all_logit_spans.append(logit_spans) - all_start_char_idx.append(start_char_idx) - all_end_char_idx.append(end_char_idx) - - # after this: all_x is list of tensors, of length world_size - example_id_num = torch.cat(all_example_id_num) - service_id = torch.cat(all_service_id) - logit_intent_status = torch.cat(all_logit_intent_status) - logit_req_slot_status = torch.cat(all_logit_req_slot_status) - logit_cat_slot_status = torch.cat(all_logit_cat_slot_status) - logit_cat_slot_value_status = torch.cat(all_logit_cat_slot_value_status) - logit_noncat_slot_status = torch.cat(all_logit_noncat_slot_status) - logit_spans = torch.cat(all_logit_spans) - start_char_idx = torch.cat(all_start_char_idx) - end_char_idx = torch.cat(all_end_char_idx) - - intent_status = torch.nn.Sigmoid()(logit_intent_status) - - # Scores are output for each requested slot. - req_slot_status = torch.nn.Sigmoid()(logit_req_slot_status) - - # For categorical slots, the status of each slot and the predicted value are output. - cat_slot_status_dist = torch.nn.Softmax(dim=-1)(logit_cat_slot_status) - - cat_slot_status = torch.argmax(logit_cat_slot_status, axis=-1) - cat_slot_status_p = torch.max(cat_slot_status_dist, axis=-1)[0] - cat_slot_value_status = torch.nn.Sigmoid()(logit_cat_slot_value_status) - - # For non-categorical slots, the status of each slot and the indices for spans are output. - noncat_slot_status_dist = torch.nn.Softmax(dim=-1)(logit_noncat_slot_status) - - noncat_slot_status = torch.argmax(logit_noncat_slot_status, axis=-1) - noncat_slot_status_p = torch.max(noncat_slot_status_dist, axis=-1)[0] - - softmax = torch.nn.Softmax(dim=1) - - scores = softmax(logit_spans) - start_scores, end_scores = torch.unbind(scores, dim=-1) - - batch_size, max_num_tokens = end_scores.size() - # Find the span with the maximum sum of scores for start and end indices. 
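-            # total_scores[b, i, j] = start_scores[b, i] + end_scores[b, j]; spans with
-            # start > end are zeroed out, the argmax is taken over the flattened
-            # (max_num_tokens x max_num_tokens) grid, and floor_divide / fmod recover
-            # the (start, end) pair from the flat index.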
- total_scores = torch.unsqueeze(start_scores, axis=2) + torch.unsqueeze(end_scores, axis=1) - start_idx = torch.arange(max_num_tokens, device=total_scores.get_device()).view(1, -1, 1) - end_idx = torch.arange(max_num_tokens, device=total_scores.get_device()).view(1, 1, -1) - invalid_index_mask = (start_idx > end_idx).repeat(batch_size, 1, 1) - total_scores = torch.where( - invalid_index_mask, - torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), - total_scores, - ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) - max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] - - span_start_index = torch.floor_divide(max_span_index, max_num_tokens) - span_end_index = torch.fmod(max_span_index, max_num_tokens) - - tensors = { - 'example_id_num': example_id_num, - 'service_id': service_id, - 'intent_status': intent_status, - 'req_slot_status': req_slot_status, - 'cat_slot_status': cat_slot_status, - 'cat_slot_status_p': cat_slot_status_p, - 'cat_slot_value_status': cat_slot_value_status, - 'noncat_slot_status': noncat_slot_status, - 'noncat_slot_status_p': noncat_slot_status_p, - 'noncat_slot_p': max_span_p, - 'noncat_slot_start': span_start_index, - 'noncat_slot_end': span_end_index, - 'noncat_alignment_start': start_char_idx, - 'noncat_alignment_end': end_char_idx, - } - return loss, tensors - - def multi_validation_epoch_end(self, outputs: List[dict], dataloader_idx: int = 0): - """ - Called at the end of validation to post process outputs into human readable format - Args: - outputs: list of individual outputs of each validation step - dataloader_idx: dataloader index - """ - avg_loss = torch.stack([x[f'val_loss'] for x in outputs]).mean() - split = self._validation_names[dataloader_idx][:-1] - dataloader = self._validation_dl[dataloader_idx] - metrics = self.multi_eval_epoch_end_helper(outputs=outputs, split=split, dataloader=dataloader) - - for k, v in metrics.items(): - self.log(f'{split}_{k}', v, rank_zero_only=True) - - self.log(f'val_loss', avg_loss, prog_bar=True, rank_zero_only=True) - - def multi_test_epoch_end(self, outputs: List[dict], dataloader_idx: int = 0): - """ - Called at the end of test to post process outputs into human readable format - Args: - outputs: list of individual outputs of each test step - dataloader_idx: dataloader index - """ - avg_loss = torch.stack([x[f'test_loss'] for x in outputs]).mean() - split = self._test_names[dataloader_idx][:-1] - dataloader = self._test_dl[dataloader_idx] - metrics = self.multi_eval_epoch_end_helper(outputs=outputs, split=split, dataloader=dataloader) - - for k, v in metrics.items(): - self.log(f'{split}_{k}', v, rank_zero_only=True) - - self.log(f'test_loss', avg_loss, prog_bar=True, rank_zero_only=True) - - def multi_eval_epoch_end_helper( - self, outputs: List[dict], split: str, dataloader: torch.utils.data.DataLoader - ) -> dict: - """ - Helper called at the end of evaluation to post process outputs into human readable format - Args: - outputs: list of individual outputs of each step - split: data split - dataloader: dataloader - Returns: - metrics: metrics collection - """ - - def get_str_example_id(split: str, ids_to_service_names_dict: dict, example_id_num: torch.Tensor) -> str: - """ - Constructs string representation of example ID - Args: - split: evaluation data split - ids_to_service_names_dict: id to service name mapping - example_id_num: tensor example id - """ - - def format_turn_id(ex_id_num): - dialog_id_1, dialog_id_2, 
turn_id, service_id, model_task_id, slot_intent_id, value_id = ex_id_num - return "{}-{}_{:05d}-{:02d}-{}-{}-{}-{}".format( - split, - dialog_id_1, - dialog_id_2, - turn_id, - ids_to_service_names_dict[service_id], - model_task_id, - slot_intent_id, - value_id, - ) - - return list(map(format_turn_id, tensor2list(example_id_num))) - - def combine_predictions_in_example(predictions: dict, batch_size: int): - ''' - Combines predicted values to a single example. - Args: - predictions: predictions ordered by keys then batch - batch_size: batch size - Returns: - examples_preds: predictions ordered by batch then key - ''' - examples_preds = [{} for _ in range(batch_size)] - for k, v in predictions.items(): - if k != 'example_id': - v = torch.chunk(v, batch_size) - - for i in range(batch_size): - if k == 'example_id': - examples_preds[i][k] = v[i] - else: - examples_preds[i][k] = v[i].view(-1) - return examples_preds - - example_id_num = torch.cat([x[f'tensors']['example_id_num'] for x in outputs]) - service_id = torch.cat([x[f'tensors']['service_id'] for x in outputs]) - intent_status = torch.cat([x[f'tensors']['intent_status'] for x in outputs]) - req_slot_status = torch.cat([x[f'tensors']['req_slot_status'] for x in outputs]) - cat_slot_status = torch.cat([x[f'tensors']['cat_slot_status'] for x in outputs]) - cat_slot_status_p = torch.cat([x[f'tensors']['cat_slot_status_p'] for x in outputs]) - cat_slot_value_status = torch.cat([x[f'tensors']['cat_slot_value_status'] for x in outputs]) - noncat_slot_status = torch.cat([x[f'tensors']['noncat_slot_status'] for x in outputs]) - noncat_slot_status_p = torch.cat([x[f'tensors']['noncat_slot_status_p'] for x in outputs]) - noncat_slot_p = torch.cat([x[f'tensors']['noncat_slot_p'] for x in outputs]) - noncat_slot_start = torch.cat([x[f'tensors']['noncat_slot_start'] for x in outputs]) - noncat_slot_end = torch.cat([x[f'tensors']['noncat_slot_end'] for x in outputs]) - noncat_alignment_start = torch.cat([x[f'tensors']['noncat_alignment_start'] for x in outputs]) - noncat_alignment_end = torch.cat([x[f'tensors']['noncat_alignment_end'] for x in outputs]) - - ids_to_service_names_dict = self.dialogues_processor.schemas._services_id_to_vocab - example_id = get_str_example_id(dataloader.dataset, ids_to_service_names_dict, example_id_num) - - metrics = {} - try: - prediction_dir = self.trainer.log_dir if self.trainer.log_dir is not None else "" - except: - prediction_dir = "" - - if self.trainer.global_rank == 0: - prediction_dir = os.path.join( - prediction_dir, 'predictions', 'pred_res_{}_{}'.format(split, self._cfg.dataset.task_name) - ) - os.makedirs(prediction_dir, exist_ok=True) - - input_json_files = DialogueSGDDataProcessor.get_dialogue_files( - self._cfg.dataset.data_dir, split, self._cfg.dataset.task_name - ) - - predictions = {} - predictions['example_id'] = example_id - predictions['service_id'] = service_id - predictions['intent_status'] = intent_status - predictions['req_slot_status'] = req_slot_status - predictions['cat_slot_status'] = cat_slot_status - predictions['cat_slot_status_p'] = cat_slot_status_p - predictions['cat_slot_value_status'] = cat_slot_value_status - predictions['noncat_slot_status'] = noncat_slot_status - predictions['noncat_slot_status_p'] = noncat_slot_status_p - predictions['noncat_slot_p'] = noncat_slot_p - predictions['noncat_slot_start'] = noncat_slot_start - predictions['noncat_slot_end'] = noncat_slot_end - predictions['noncat_alignment_start'] = noncat_alignment_start - predictions['noncat_alignment_end'] = 
noncat_alignment_end - - in_domain_services = get_in_domain_services( - os.path.join(self._cfg.dataset.data_dir, split, "schema.json"), - self.dialogues_processor.get_seen_services("train"), - ) - predictions = combine_predictions_in_example(predictions, service_id.shape[0]) - - # write predictions to file in Dstc8/SGD format - write_predictions_to_file( - predictions, - input_json_files, - output_dir=prediction_dir, - schemas=self.dialogues_processor.schemas, - state_tracker=self._cfg.dataset.state_tracker, - eval_debug=False, - in_domain_services=in_domain_services, - ) - metrics = evaluate( - prediction_dir, - self._cfg.dataset.data_dir, - split, - in_domain_services, - joint_acc_across_turn=self._cfg.dataset.joint_acc_across_turn, - use_fuzzy_match=self._cfg.dataset.use_fuzzy_match, - ) - - return metrics - - def prepare_data(self): - """ - Preprocessed schema and dialogues and caches this - """ - if self.data_prepared: - return - - self.dialogues_processor = DialogueSGDDataProcessor( - data_dir=self._cfg.dataset.data_dir, - dialogues_example_dir=self._cfg.dataset.dialogues_example_dir, - tokenizer=self.tokenizer, - cfg=self._cfg.dataset, - ) - - self.data_prepared = True - - def update_data_dirs(self, data_dir: str, dialogues_example_dir: str): - """ - Update data directories - - Args: - data_dir: path to data directory - dialogues_example_dir: path to preprocessed dialogues example directory, if not exists will be created. - """ - if not os.path.exists(data_dir): - raise ValueError(f"{data_dir} is not found") - self._cfg.dataset.data_dir = data_dir - self._cfg.dataset.dialogues_example_dir = dialogues_example_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.') - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, split=val_data_config.ds_item) - - def setup_test_data(self, test_data_config: Optional[DictConfig] = None): - self.prepare_data() - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, split=test_data_config.ds_item) - - def _setup_dataloader_from_config(self, cfg: DictConfig, split: str) -> DataLoader: - dataset_cfg = self._cfg.dataset - data_dir = dataset_cfg.data_dir - - if not os.path.exists(data_dir): - raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - - # dataset = SGDDataset(dataset_split=split, dialogues_processor=self.dialogues_processor) - - dataset = DialogueSGDBERTDataset( - dataset_split=split, - dialogues_processor=self.dialogues_processor, - tokenizer=self.dialogues_processor._tokenizer, - schemas=self.dialogues_processor.schemas, - schema_config=self.dialogues_processor.schema_config, - cfg=dataset_cfg, - ) - - dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - drop_last=cfg.drop_last, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - ) - return dl - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. 
- - Returns: - List of available pre-trained models. - """ - result = [] - - result.append( - PretrainedModelInfo( - pretrained_model_name="sgdqa_bertbasecased", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/sgdqa_bertbasecased/versions/1.0.0/files/sgdqa_bertbasecased.nemo", - description="Dialogue State Tracking model finetuned from NeMo BERT Base Cased on Google SGD dataset which has a joint goal accuracy of 59.72% on dev set and 45.85% on test set.", - ) - ) - return result diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py index 17c768705bdd..82f0ee3bbcd1 100644 --- a/nemo/collections/nlp/modules/__init__.py +++ b/nemo/collections/nlp/modules/__init__.py @@ -13,21 +13,17 @@ # limitations under the License. -from nemo.collections.nlp.modules.common import ( - AlbertEncoder, - BertEncoder, - BertModule, - CamembertEncoder, - DistilBertEncoder, - PromptEncoder, - RobertaEncoder, - SequenceClassifier, - SequenceRegression, - SequenceTokenClassifier, - get_lm_model, - get_pretrained_lm_models_list, - get_tokenizer, - get_tokenizer_list, -) -from nemo.collections.nlp.modules.dialogue_state_tracking.sgd_decoder import SGDDecoder -from nemo.collections.nlp.modules.dialogue_state_tracking.sgd_encoder import SGDEncoder +from nemo.collections.nlp.modules.common import AlbertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import BertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import BertModule # noqa: F401 +from nemo.collections.nlp.modules.common import CamembertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import DistilBertEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import PromptEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import RobertaEncoder # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceClassifier # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceRegression # noqa: F401 +from nemo.collections.nlp.modules.common import SequenceTokenClassifier # noqa: F401 +from nemo.collections.nlp.modules.common import get_lm_model # noqa: F401 +from nemo.collections.nlp.modules.common import get_pretrained_lm_models_list # noqa: F401 +from nemo.collections.nlp.modules.common import get_tokenizer # noqa: F401 +from nemo.collections.nlp.modules.common import get_tokenizer_list # noqa: F401 diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py b/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py deleted file mode 100644 index 9e3250071955..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
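Editor's note on the evaluation post-processing in sgdqa_model.py above: combine_predictions_in_example regroups a dict of batched tensors into one dict per example before the SGD/DSTC8-format predictions are written out. A standalone sketch of that reshaping is shown below (it omits the special-cased string 'example_id' key handled in the original):

    import torch

    def combine_predictions_in_example(predictions: dict, batch_size: int):
        examples = [{} for _ in range(batch_size)]
        for key, value in predictions.items():
            chunks = torch.chunk(value, batch_size)    # split along the batch dimension
            for i in range(batch_size):
                examples[i][key] = chunks[i].view(-1)  # flatten each example's slice
        return examples

    preds = {"intent_status": torch.rand(4, 3), "req_slot_status": torch.rand(4, 5)}
    per_example = combine_predictions_in_example(preds, batch_size=4)
    print(per_example[0]["intent_status"].shape)  # torch.Size([3])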
diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py b/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py deleted file mode 100644 index 2ffe5330183e..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_decoder.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import torch -from torch import nn as nn - -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import LogitsType, NeuralType - -__all__ = ['SGDDecoder'] - - -class LogitsQA(nn.Module): - def __init__(self, num_classes: int, embedding_dim: int): - """Get logits for elements by conditioning on input embedding. - Args: - num_classes: An int containing the number of classes for which logits are to be generated. - embedding_dim: hidden size of the BERT - - Returns: - A tensor of shape (batch_size, num_classes) containing the logits. - """ - super().__init__() - self.num_classes = num_classes - self.utterance_proj = nn.Linear(embedding_dim, embedding_dim) - self.activation = nn.functional.gelu - - self.layer1 = nn.Linear(embedding_dim, num_classes) - - def forward(self, encoded_utterance): - """ - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - """ - - # Project the utterance embeddings. - utterance_embedding = self.utterance_proj(encoded_utterance) - utterance_embedding = self.activation(utterance_embedding) - - logits = self.layer1(utterance_embedding) - return logits - - -class SGDDecoder(NeuralModule): - """ - SGDDecoder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - "logit_intent_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_req_slot_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_cat_slot_status": NeuralType(('B', 'T'), LogitsType()), - "logit_cat_slot_value_status": NeuralType(('B', 'T'), LogitsType()), #'B' - "logit_noncat_slot_status": NeuralType(('B', 'T'), LogitsType()), - "logit_spans": NeuralType(('B', 'T', 'D'), LogitsType()), - } - - def __init__(self, embedding_dim: int) -> None: - - """Get logits for elements by conditioning on utterance embedding. - - Args: - embedding_dim: hidden size of the BERT - """ - super().__init__() - - projection_module = LogitsQA - - self.intent_layer = projection_module(1, embedding_dim) - self.requested_slots_layer = projection_module(1, embedding_dim) - - self.cat_slot_value_layer = projection_module(1, embedding_dim) - - # Slot status values: none, dontcare, active. 
- self.slot_status_layer = projection_module(3, embedding_dim) - - # dim 2 for non_categorical slot - to represent start and end position - self.noncat_layer1 = nn.Linear(embedding_dim, embedding_dim) - self.noncat_activation = nn.functional.gelu - self.noncat_layer2 = nn.Linear(embedding_dim, 2) - - @typecheck() - def forward(self, encoded_utterance, token_embeddings, utterance_mask): - """ - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - token_embeddings: token embeddings from BERT encoding of the utterance - utterance_mask: utterance mask wiht 0 for padding - """ - _, _ = encoded_utterance.size() - logit_intent_status = self._get_intents(encoded_utterance) - - logit_req_slot_status = self._get_requested_slots(encoded_utterance) - - logit_cat_slot_status, logit_cat_slot_value_status = self._get_categorical_slot_goals(encoded_utterance) - - (logit_noncat_slot_status, logit_spans) = self._get_noncategorical_slot_goals( - encoded_utterance=encoded_utterance, utterance_mask=utterance_mask, token_embeddings=token_embeddings - ) - - return ( - logit_intent_status, - logit_req_slot_status, - logit_cat_slot_status, - logit_cat_slot_value_status, - logit_noncat_slot_status, - logit_spans, - ) - - def _get_intents(self, encoded_utterance): - """Obtain logits for intents. - Args: - encoded_utterance: representation of utterance - """ - logits = self.intent_layer(encoded_utterance=encoded_utterance,) - return logits - - def _get_requested_slots(self, encoded_utterance): - """Obtain logits for requested slots. - Args: - encoded_utterance: representation of utterance - """ - - logits = self.requested_slots_layer(encoded_utterance=encoded_utterance) - return logits - - def _get_categorical_slot_goals(self, encoded_utterance): - """ - Obtain logits for status and values for categorical slots - Slot status values: none, dontcare, active - Args: - encoded_utterance: representation of utterance - """ - - # Predict the status of all categorical slots. - status_logits = self.slot_status_layer(encoded_utterance=encoded_utterance) - - value_status_logits = self.cat_slot_value_layer(encoded_utterance=encoded_utterance) - return status_logits, value_status_logits - - def _get_noncategorical_slot_goals(self, encoded_utterance, utterance_mask, token_embeddings): - """ - Obtain logits for status and slot spans for non-categorical slots. - Slot status values: none, dontcare, active - Args: - encoded_utterance: [CLS] token hidden state from BERT encoding of the utterance - utterance_mask: utterance mask wiht 0 for padding - token_embeddings: token embeddings from BERT encoding of the utterance - """ - status_logits = self.slot_status_layer(encoded_utterance=encoded_utterance) - - # Project the combined embeddings to obtain logits, Shape: (batch_size, max_num_slots, max_num_tokens, 2) - span_logits = self.noncat_layer1(token_embeddings) - span_logits = self.noncat_activation(span_logits) - span_logits = self.noncat_layer2(span_logits) - - # Mask out invalid logits for padded tokens. - utterance_mask = utterance_mask.to(bool) # Shape: (batch_size, max_num_tokens). 
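-        # Broadcast the (batch, max_num_tokens, 1) mask over the two span channels and
-        # replace padded positions with a large negative constant so they cannot win
-        # the downstream softmax/argmax over candidate start and end indices.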
- repeated_utterance_mask = utterance_mask.unsqueeze(-1) - negative_logits = (torch.finfo(span_logits.dtype).max * -0.7) * torch.ones( - span_logits.size(), device=span_logits.get_device(), dtype=span_logits.dtype - ) - - span_logits = torch.where(repeated_utterance_mask, span_logits, negative_logits) - - return status_logits, span_logits - - def _get_negative_logits(self, logits): - """Returns tensor with negative logits that will be used to mask out unused values for a particular service - Args: - logits: logits whose shape and type will be used to create negative tensor - """ - negative_logits = (torch.finfo(logits.dtype).max * -0.7) * torch.ones( - logits.size(), dtype=logits.dtype, device=logits.get_device() - ) - return negative_logits diff --git a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py b/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py deleted file mode 100644 index 948a806ad37c..000000000000 --- a/nemo/collections/nlp/modules/dialogue_state_tracking/sgd_encoder.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from torch import nn as nn - -from nemo.collections.nlp.modules.common.classifier import Classifier -from nemo.core.classes import typecheck -from nemo.core.neural_types import ChannelType, LogitsType, NeuralType - -__all__ = ['SGDEncoder'] - -ACT2FN = {"tanh": nn.functional.tanh, "relu": nn.functional.relu} - - -class SGDEncoder(Classifier): - """ - Neural module which encodes BERT hidden states - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - - return { - "logits": NeuralType(('B', 'T'), LogitsType()), - 'hidden_states': NeuralType(('B', 'T', 'C'), ChannelType()), - } - - def __init__( - self, hidden_size: int, activation: str = 'tanh', dropout: float = 0.0, use_transformer_init: bool = True, - ) -> None: - - """ - Args: - hidden_size: hidden size of the BERT model - activation: activation function applied - dropout: dropout ratio - use_transformer_init: use transformer initialization - """ - super().__init__(hidden_size=hidden_size, dropout=dropout) - self.fc = nn.Linear(hidden_size, hidden_size) - - if activation not in ACT2FN: - raise ValueError(f'{activation} is not in supported ' + '{ACTIVATIONS_F.keys()}') - - self.activation = ACT2FN[activation] - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.post_init(use_transformer_init=use_transformer_init) - - @typecheck() - def forward(self, hidden_states): - """ - Args: - hidden_states: bert output hidden states - """ - first_token_hidden_states = hidden_states[:, 0] - logits = self.fc(first_token_hidden_states) - logits = self.activation(logits) - logits = self.dropout1(logits) - return logits, self.dropout2(hidden_states)
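Editor's note on sgd_encoder.py above: SGDEncoder is essentially a [CLS]-pooling head; it takes the first token's hidden state, projects it through a linear layer, applies the activation, and returns the dropped-out pooled vector alongside the dropped-out token embeddings. A minimal standalone sketch of that pattern follows; the class name and defaults are illustrative, not NeMo's.

    import torch
    from torch import nn

    class ClsPooler(nn.Module):
        def __init__(self, hidden_size: int, dropout: float = 0.0):
            super().__init__()
            self.fc = nn.Linear(hidden_size, hidden_size)
            self.dropout1 = nn.Dropout(dropout)
            self.dropout2 = nn.Dropout(dropout)

        def forward(self, hidden_states: torch.Tensor):
            # hidden_states: (batch, seq_len, hidden_size) from a BERT-style encoder
            pooled = torch.tanh(self.fc(hidden_states[:, 0]))
            return self.dropout1(pooled), self.dropout2(hidden_states)

    pooler = ClsPooler(hidden_size=768, dropout=0.1)
    utterance_repr, token_repr = pooler(torch.randn(2, 16, 768))
    print(utterance_repr.shape, token_repr.shape)  # torch.Size([2, 768]) torch.Size([2, 16, 768])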