diff --git a/examples/nlp/entity_linking/build_index.py b/examples/nlp/entity_linking/build_index.py deleted file mode 100644 index eeba5c83130e..000000000000 --- a/examples/nlp/entity_linking/build_index.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -import random -from argparse import ArgumentParser - -import h5py -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf -from sklearn.decomposition import PCA -from tqdm import tqdm - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def build_index(cfg: DictConfig, model: object): - """ - Builds faiss index from index dataset specified in the config. - - Args: - cfg (DictConfig): Config file specifying index parameters - model (object): Encoder model - """ - - # Get index dataset embeddings - # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them - if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name): - logging.info("Loading reduced dimensionality embeddings") - embeddings = h5py.File(cfg.pca_embeddings_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - elif os.path.isfile(cfg.embedding_save_name): - logging.info("Loading previously extracted index dataset embeddings") - embeddings = h5py.File(cfg.embedding_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - else: - logging.info("Encoding index dataset, this may take a while") - index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True) - embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model) - - # Create pca model to reduce dimensionality of index dataset and decrease memory footprint - if cfg.apply_pca: - - # Need to train PCA model and apply PCA transformation with newly trained model - if not os.path.isfile(cfg.pca.pca_save_name): - logging.info("Fitting PCA model for embedding dimensionality reduction") - pca_train_set = random.sample(list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction)) - pca = PCA(n_components=cfg.pca.output_dim) - pca.fit(pca_train_set) - pkl.dump(pca, open(cfg.pca.pca_save_name, "wb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # PCA model already trained, just need to reduce dimensionality of all embeddings - elif not os.path.isfile(cfg.pca_embeddings_save_name): - pca = pkl.load(open(cfg.pca.pca_save_name, "rb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # Build faiss index from embeddings - logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus") - quantizer = faiss.IndexFlatL2(cfg.dims) - 
index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist) - index = faiss.index_cpu_to_all_gpus(index) - index.train(embeddings) - - logging.info("Adding dataset embeddings to index") - for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)): - index.add(embeddings[i : i + cfg.index_batch_size]) - - logging.info("Saving index") - faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name) - logging.info("Index built and saved") - - -def reduce_embedding_dim(pca, embeddings, cfg): - """Apply PCA transformation to index dataset embeddings""" - - logging.info("Applying PCA transformation to entire index dataset") - embeddings = np.array(pca.transform(embeddings), dtype=np.float32) - emb_file = h5py.File(cfg.pca_embeddings_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - return embeddings - - -def get_index_embeddings(cfg: DictConfig, dataloader: object, model: object): - """Use entity linking encoder to get embeddings for full index dataset""" - embeddings = [] - concept_ids = [] - - with torch.no_grad(): - for batch in tqdm(dataloader): - input_ids, token_type_ids, input_mask, batch_concept_ids = batch - input_ids = input_ids.to(device) - token_type_ids = token_type_ids.to(device) - input_mask = input_mask.to(device) - batch_embeddings = model.forward( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask - ) - - embeddings.extend(batch_embeddings.detach().cpu().numpy()) - concept_ids.extend(batch_concept_ids.numpy()) - - emb_file = h5py.File(cfg.embedding_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - pkl.dump(concept_ids, open(cfg.concept_id_save_name, "wb")) - - return embeddings, concept_ids - - -def load_model(cfg: DictConfig, restore: bool): - """ - Loads encoder model. - - Args: - cfg: Config file specifying model parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if restore: - model = EntityLinkingModel.restore_from(cfg.nemo_path) - else: - cfg.train_ds = None - cfg.validation_ds = None - cfg.test_ds = None - model = EntityLinkingModel(cfg) - - model = model.to(device) - - return model - - -def main(cfg: DictConfig, restore: bool): - """ - Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) - ): - logging.info("Building index") - build_index(cfg.index, model) - else: - logging.info("Index and pca model (if required) already exists. Skipping build index step.") - - if not os.path.isfile(cfg.index.idx_to_id): - logging.info("Mapping entity index postions to ids") - map_idx_to_ids(cfg.index) - else: - logging.info("Map from concept index to id already exists. 
Skipping mapping step.") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml deleted file mode 100644 index b7f538ccd68f..000000000000 --- a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -project_dir: null -name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 2 - enable_checkpointing: False - logger: false -model: - nemo_path: ??? - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - loss_params: null - train_ds: - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 768 - nlist: 2 - top_n: 3 - query_num_factor: 20 - index_save_name: ??? - index_batch_size: 10 - index_ds: - name: tiny_example - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 100 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/idx_to_id.pkl - id_to_string: ${project_dir}/id_to_string.pkl - concept_id_save_name: ${project_dir}/tiny_example_concept_ids.pkl - embedding_save_name: ${project_dir}/tiny_example_concept_embeddings.hdf5 - pca_embeddings_save_name: null - apply_pca: false - pca: null -exp_manager: - exp_dir: . - name: ${project_dir}/SelfAlignmentPretrainingTinyExample - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml deleted file mode 100644 index ad636ef23e18..000000000000 --- a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml +++ /dev/null @@ -1,95 +0,0 @@ -project_dir: ??? 
-name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 1000 - enable_checkpointing: False - logger: false -model: - nemo_path: ${project_dir}/sap_bert_umls.nemo - raw_data: ${project_dir}/data/MRCONSO.RRF - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - train_ds: - data_file: ${project_dir}/data/umls_train_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ${project_dir}/data/umls_validation_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 256 - nlist: 300 - top_n: 5 - query_num_factor: 20 - index_save_name: ${project_dir}/medical_entity_linking_index - index_batch_size: 1000 - raw_data: ${model.raw_data} - index_ds: - name: umls - data_file: ${project_dir}/data/umls_index_concepts.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/data/idx_to_id.pkl - id_to_string: ${project_dir}/data/id_to_string.pkl - concept_id_save_name: ${project_dir}/data/concept_ids.pkl - embedding_save_name: ${project_dir}/data/medical_concept_embeddings.hdf5 - pca_embeddings_save_name: ${project_dir}/data/medical_concept_reduced_${index.dims}dim_embeddings.hdf5 - apply_pca: true - pca: - input_dim: 756 - output_dim: ${index.dims} - sample_fraction: 0.5 - pca_save_name: ${project_dir}/${index.pca.input_dim}_to_${index.pca.output_dim}_pca_model.pkl -exp_manager: - exp_dir: ${project_dir}/medical_entity_linking_experiments - name: sap_bert_umls - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/entity_linking/data/umls_dataset_processing.py b/examples/nlp/entity_linking/data/umls_dataset_processing.py deleted file mode 100644 index 03a17da3c0bc..000000000000 --- a/examples/nlp/entity_linking/data/umls_dataset_processing.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -import pickle as pkl -import random -from argparse import ArgumentParser - -import pandas as pd -from omegaconf import OmegaConf -from tqdm import tqdm - -# Info on these headers can be found here on the UMLS website https://www.ncbi.nlm.nih.gov/books/NBK9685/ -# section 3.3.4 Table 1 -HEADERS = [ - 'CUI', - 'LAT', - 'TS', - 'LUI', - 'STT', - 'SUI', - 'ISPREF', - 'AUI', - 'SAUI', - 'SCUI', - 'SDUI', - 'SAB', - 'TTY', - 'CODE', - 'STR', - 'SRL', - 'SUPPRESS', - 'CVF', -] - - -def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers): - """ - Generates and saves UMLS self alignment pretraining train and validation data. Takes the raw .RRF UMLS - data file and creates different pair combinations for entities with the same CUI. Each row in the output - will be formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs. - Saves two .tsv output files, one for the train split and one for the validation split. - Only data marked as English is added to the train and val splits. - - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - train_save_name (str): path to where training data will be saved - val_save_name (str): path to where validation data will be saved - max_pairs (int): max number of pairs for any one CUI added to the train - or validation splits - train_split (float): precentage of raw data to be added to train set split - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading training data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - train_file = open(train_save_name, 'w') - val_file = open(val_save_name, 'w') - - cui = df["CUI"].iloc[0] - names = [] - random.seed(2021) - - for idx in tqdm(range(len(df))): - # Address incorrectly formatted data - if type(df["STR"].iloc[idx]) != str or "|" in df["STR"].iloc[idx]: - continue - - # Collect all english concept strings matching the current CUI - if df["CUI"].iloc[idx] == cui and df["LAT"].iloc[idx] == "ENG": - concept_string = df["STR"].iloc[idx] - names.append(concept_string) - - else: - # Pair off concept synonyms to make training and val sets - pairs = list(itertools.combinations(names, 2)) - - if len(pairs) == 0: - # Not enough concepts gathered to make a pair - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - continue - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - random.shuffle(pairs) - - # Keep up to max pairs number pairs for any one concept - for pair in pairs[:max_pairs]: - - # Want concepts in train and val splits to be randomly selected and mutually exclusive - add_to_train = random.random() - - if add_to_train <= train_split: - train_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - else: - val_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - - # Switch to next concept - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - - train_file.close() - val_file.close() - print("Finished making training and validation data") - - -def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers): - """ - Generates data file needed to build a UMLS index and a hash table mapping each - CUI to one canonical concept string. Takes the raw .RRF data file and saves - a .tsv indec concept file as well as the a .pkl file of cui to concept string - mappings. Only data marked as English is added to the index data file. 
- - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - data_savename (str): path to where .tsv index data will be saved - id2string_savename (str): path to where .pkl cui to string mapping will - be saved - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading index data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - id2string = {} - - with open(data_savename, "w") as outfile: - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - # Address incorrectly formatted data - if type(row["STR"]) != str or "|" in row["STR"]: - continue - - cui = row["CUI"] - sent = row["STR"] - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - - # Only keeping english concepts - if row["LAT"] == "ENG": - outfile.write(f'{cui}\t{sent}\n') - - # Matching each cui to one canonical string represention - if cui not in id2string and ":" not in sent: - id2string[cui] = sent - - outfile.close() - pkl.dump(id2string, open(id2string_savename, "wb")) - print("Finished saving index data and id to concept mapping") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--index", action="store_true", help="Whether to process data for building an index") - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml") - parser.add_argument( - "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concepts" - ) - parser.add_argument( - "--train_split", required=False, type=float, default=0.99, help="Precentage of data to add to train set" - ) - - args = parser.parse_args() - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - if args.index: - process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS) - else: - process_umls_training_dataset( - cfg.model.raw_data, - cfg.model.train_ds.data_file, - cfg.model.validation_ds.data_file, - args.max_pairs, - args.train_split, - HEADERS, - ) diff --git a/examples/nlp/entity_linking/query_index.py b/examples/nlp/entity_linking/query_index.py deleted file mode 100644 index 6cb51a7de160..000000000000 --- a/examples/nlp/entity_linking/query_index.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -from argparse import ArgumentParser -from collections import OrderedDict -from typing import Dict - -import numpy as np -import torch -from build_index import load_model -from omegaconf import DictConfig, OmegaConf - -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. 
Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def get_query_embedding(query, model): - """Use entity linking encoder to get embedding for index query""" - model_input = model.tokenizer( - query, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=512, - return_token_type_ids=True, - return_attention_mask=True, - ) - - query_emb = model.forward( - input_ids=torch.LongTensor([model_input["input_ids"]]).to(device), - token_type_ids=torch.LongTensor([model_input["token_type_ids"]]).to(device), - attention_mask=torch.LongTensor([model_input["attention_mask"]]).to(device), - ) - - return query_emb - - -def query_index( - query: str, cfg: DictConfig, model: object, index: object, pca: object, idx2id: dict, id2string: dict, -) -> Dict: - - """ - Query the nearest neighbor index of entities to find the - concepts in the index dataset that are most similar to the - query. - - Args: - query (str): entity to look up in the index - cfg (DictConfig): config object to specifiy query parameters - model (EntityLinkingModel): entity linking encoder model - index (object): faiss index - pca (object): sklearn pca transformation to be applied to queries - idx2id (dict): dictionary mapping unique concept dataset index to - its CUI - id2string (dict): dictionary mapping each unqiue CUI to a - representative english description of - the concept - Returns: - A dictionary with the concept ids of the index's most similar - entities as the keys and a tuple containing the string - representation of that concept and its cosine similarity to - the query as the values. - """ - query_emb = get_query_embedding(query, model).detach().cpu().numpy() - - if cfg.apply_pca: - query_emb = pca.transform(query_emb) - - dist, neighbors = index.search(query_emb.astype(np.float32), cfg.query_num_factor * cfg.top_n) - dist, neighbors = dist[0], neighbors[0] - unique_ids = OrderedDict() - neighbor_idx = 0 - - # Many of nearest neighbors could map to the same concept id, their idx is their unique identifier - while len(unique_ids) < cfg.top_n and neighbor_idx < len(neighbors): - concept_id_idx = neighbors[neighbor_idx] - concept_id = idx2id[concept_id_idx] - - # Only want one instance of each unique concept - if concept_id not in unique_ids: - concept = id2string[concept_id] - unique_ids[concept_id] = (concept, 1 - dist[neighbor_idx]) - - neighbor_idx += 1 - - unique_ids = dict(unique_ids) - - return unique_ids - - -def main(cfg: DictConfig, restore: bool): - """ - Loads faiss index and allows commandline queries - to the index. Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) or not os.path.isfile(cfg.index.idx_to_id) - ): - logging.warning("Either no index and/or no mapping from entity idx to ids exists. 
Please run `build_index.py`") - return - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - logging.info("Loading index and associated files") - index = faiss.read_index(cfg.index.index_save_name) - idx2id = pkl.load(open(cfg.index.idx_to_id, "rb")) - id2string = pkl.load(open(cfg.index.id_to_string, "rb")) # Should be created during dataset prep - - if cfg.index.apply_pca: - pca = pkl.load(open(cfg.index.pca.pca_save_name, "rb")) - - while True: - query = input("enter index query: ") - output = query_index(query, cfg.top_n, cfg.index, model, index, pca, idx2id, id2string) - - if query == "exit": - break - - for concept_id in output: - concept_details = output[concept_id] - concept_id = "C" + str(concept_id).zfill(7) - print(concept_id, concept_details) - - print("----------------\n") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/examples/nlp/entity_linking/self_alignment_pretraining.py b/examples/nlp/entity_linking/self_alignment_pretraining.py deleted file mode 100644 index 58b20f384d04..000000000000 --- a/examples/nlp/entity_linking/self_alignment_pretraining.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Please see tutorial at Nemo/tutorials/nlp/Entity_Linking_Medical.ipynb for -# more information on entity linking and self alignment pretraining. 
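The removed build_index.py and query_index.py scripts above wrap a common FAISS pattern: train an IVF index on encoder embeddings, add the dataset vectors, then map search results back to concept IDs. A minimal, self-contained sketch of that flow follows; the dimensions, nlist, nprobe, and the idx2id mapping are illustrative assumptions, not values taken from the deleted configs.

# Sketch of the FAISS IVF train/add/search flow used by the removed examples.
# All sizes and the idx2id mapping are stand-ins, not the original settings.
import faiss
import numpy as np

dims, nlist = 256, 300
embeddings = np.random.rand(10_000, dims).astype(np.float32)   # stand-in for encoder outputs
idx2id = {i: i for i in range(len(embeddings))}                 # stand-in for idx_to_id.pkl

quantizer = faiss.IndexFlatL2(dims)
index = faiss.IndexIVFFlat(quantizer, dims, nlist)
index.train(embeddings)            # IVF indexes must be trained before vectors are added
index.add(embeddings)
index.nprobe = 10                  # search more clusters than the default of 1

query = np.random.rand(1, dims).astype(np.float32)              # stand-in for a (PCA-reduced) query embedding
dist, neighbors = index.search(query, 5)
concept_ids = [idx2id[int(i)] for i in neighbors[0]]            # map index positions back to concept ids

As in the removed scripts, the search result is a list of index positions that still has to be translated into concept identifiers and deduplicated by the caller.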
- -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="umls_medical_entity_linking_config.yaml") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f"\nConfig Params:\n{OmegaConf.to_yaml(cfg)}") - trainer = Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - logging.info(f"Loading weights from pretrained model {cfg.model.language_model.pretrained_model_name}") - model = EntityLinkingModel(cfg=cfg.model, trainer=trainer) - logging.info("===========================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - logging.info("===========================================================================================") - - if cfg.model.nemo_path: - # '.nemo' file contains the last checkpoint and the params to initialize the model - model.save_to(cfg.model.nemo_path) - logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/glue_benchmark/glue_benchmark.py b/examples/nlp/glue_benchmark/glue_benchmark.py deleted file mode 100644 index 28efb9520fbd..000000000000 --- a/examples/nlp/glue_benchmark/glue_benchmark.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -## Tasks -This script works with all GLUE Benchmark tasks, more details about the GLUE Benchmark could be found at -https://gluebenchmark.com/ - -More details on how to use this script could be found in tutorials/nlp/GLUE_Benchmark.ipynb - -## Model Training - -To train GLUEModel with the default config file, run: - python glue_benchmark.py \ - model.dataset.data_dir= \ - model.task_name=TASK_NAME \ - trainer.max_epochs= \ - trainer.devices="[] - -Supported task names: -["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] -Note, MNLI task includes both matched and mismatched dev sets -""" - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import GLUEModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_name="glue_benchmark_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager_cfg = cfg.get("exp_manager", None) - - if exp_manager_cfg: - exp_manager_cfg.name = cfg.model.task_name - logging.info(f'Setting task_name to {exp_manager_cfg.name} in exp_manager') - exp_manager(trainer, exp_manager_cfg) - - if cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path): - model = GLUEModel.restore_from(cfg.model.nemo_path) - logging.info(f'Restoring model from {cfg.model.nemo_path}') - model.update_data_dir(data_dir=cfg.model.dataset.data_dir) - model.setup_training_data() - model.setup_multiple_validation_data() - trainer.fit(model) - else: - model = GLUEModel(cfg.model, trainer=trainer) - trainer.fit(model) - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml deleted file mode 100644 index 21cdc04db22f..000000000000 --- a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# GLUE Benchmark with pre-trained BERT models -supported_tasks: &supported_tasks ['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli'] - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - task_name: &task_name mrpc # choose from: ["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] GLUE task name, MNLI includes both matched and mismatched dev sets - supported_tasks: *supported_tasks - output_dir: null # dir to write write predictions - nemo_path: null # filename to save the model and associated artifacts to .nemo file - dataset: - data_dir: ??? 
# /path/to/data - max_seq_length: 128 - use_cache: true - - # shared across dataloaders: - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - ds_item: 'train.tsv' - shuffle: true - num_samples: -1 - batch_size: 32 - - validation_ds: - ds_item: 'dev.tsv' # for MNLI 'dev_matched.tsv' and 'dev_mismatched.tsv' will de used - shuffle: false - num_samples: -1 - batch_size: 32 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: *task_name # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/information_retrieval/bert_dpr.py b/examples/nlp/information_retrieval/bert_dpr.py deleted file mode 100644 index 4fc791da04fd..000000000000 --- a/examples/nlp/information_retrieval/bert_dpr.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertDPRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_dpr_model = BertDPRModel(cfg.model, trainer=trainer) - trainer.fit(bert_dpr_model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/bert_joint_ir.py b/examples/nlp/information_retrieval/bert_joint_ir.py deleted file mode 100644 index f95cdd04e036..000000000000 --- a/examples/nlp/information_retrieval/bert_joint_ir.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertJointIRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_joint_ir_model = BertJointIRModel(cfg.model, trainer=trainer) - trainer.fit(bert_joint_ir_model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml deleted file mode 100644 index 56e573e0bcf6..000000000000 --- a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Fine-tuning BERT model for information retrieval -name: &name BertIR -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 0.05 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - - language_model: - pretrained_model_name: bert-base-uncased - sim_score_dropout: 0.1 - lm_checkpoint: null - config: - attention_probs_dropout_prob: 0.1 - hidden_act: gelu - hidden_dropout_prob: 0.1 - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_hidden_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 - config_file: null # json file, precedence over config - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # tokenizer model for sentencepiece - special_tokens: null - - train_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with training queries and their indices - query_to_passages: null - # path to file with training examples which have the form of - # (query_id, relevant_passage_id, irrelevant_passage_1_id, ..., irrelevant_passage_n_id) - num_negatives: 10 - batch_size: 6 - psg_cache_format: npz - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - validation_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with validation queries and their indices - query_to_passages: null # path to file with passages to re-rank for each validation query - num_negatives: 10 - batch_size: 6 - psg_cache_format: pkl - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - optim: - name: adam - lr: 1e-5 - betas: [0.9, 0.999] - weight_decay: 0 - - sched: - name: WarmupAnnealing - warmup_steps: null - warmup_ratio: 0.05 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml b/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml deleted file mode 100644 index 7e4ecf09f5a0..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_bert_embedding_config.yaml +++ /dev/null @@ -1,160 +0,0 @@ -name: megatron_bert -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_bert - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - - -model: - # model parallelism - mcore_bert: True - micro_batch_size: 4 - global_batch_size: 8 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null - - # model architecture - encoder_seq_length: 512 - max_position_embeddings: ${.encoder_seq_length} - position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. - num_layers: 24 - hidden_size: 1024 - ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size. - num_attention_heads: 16 - transformer_block_type: post_ln - add_pooler: True - add_lm_head: False - init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. - normalization: layernorm - layernorm_epsilon: 1e-12 - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - megatron_legacy: False - tokenizer: - library: 'huggingface' - type: 'intfloat/e5-large-unsupervised' - model: null - vocab_file: null - merge_file: null - - # embedding-specific arguemnts - softmax_temp: 0.02 # softmax temp for contrastive loss - global_inbatch_negatives: True # whether to use in-batch negatives from other ranks during training - backprop_type: 'global' # whether to use `global` or `local` backpropagation during training. Refer to Flava paper for details. - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: False - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. - gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - ## Activation Checkpointing - # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. - # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - # 'full' will checkpoint the entire transformer layer. - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - num_micro_batches_with_partial_activation_checkpoints: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed - # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is - # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint - # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. - # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later - # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than - # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage - # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', - # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. - sequence_parallel: False - - data: - # Path to data must be specified by the user. 
- data_train: null - data_validation: null - hard_negatives_to_train: 4 # number of hard negatives to use per example for training - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic, LDDL - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - masked_lm_prob: 0.15 # Probability of replacing a token with mask. - short_seq_prob: 0.1 # Probability of producing a short sequence. - - optim: - name: fused_adam - lr: 2e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 2e-5 diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml deleted file mode 100644 index e407aec167e9..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml +++ /dev/null @@ -1,221 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.test_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: True - save_best_model: True - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 1 - micro_batch_size: 1 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. - transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - - peft: - peft_scheme: "lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - restore_from_ckpt: - checkpoint_dir: null - checkpoint_name: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) - adapter_dim: 32 - alpha: ${peft.lora_tuning.adapter_dim} - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - test_ds: - query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. - doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. - names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${global_batch_size} - micro_batch_size: ${micro_batch_size} - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - add_eos: True - add_bos: False - write_embeddings_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
- compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: output.txt - compute_attention_mask: True - -# server-related configs -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: True # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server 1058 -chat: False # use the chat interface -chatbot_config: - value: False # whether to inject the value attributes - attributes: - - name: Quality - min: 0 - max: 4 - key: quality - type: int - default: 4 - - name: Toxicity - min: 0 - max: 4 - key: toxcity - type: int - default: 0 - - name: Humor - min: 0 - max: 4 - key: humor - type: int - default: 0 - - name: Creativity - min: 0 - max: 4 - key: creativity - type: int - default: 0 - - name: Violence - min: 0 - max: 4 - key: violence - type: int - default: 0 - - name: Helpfulness - min: 0 - max: 4 - key: helpfulness - type: int - default: 4 - - name: Not_Appropriate - min: 0 - max: 4 - key: not_appropriate - type: int - default: 0 - - name: Language - choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] - key: lang - type: list - default: en - - user: User - assistant: Assistant - system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml deleted file mode 100644 index 1c2db1a862f4..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml +++ /dev/null @@ -1,220 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch - gradient_clip_val: null - num_sanity_val_steps: 0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: False - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: True - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. 
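The `temperature`, `num_soft_negatives`, and `use_all_possible_negatives` settings above configure a contrastive objective over query/document embeddings; the real loss lives in `MegatronGPTEmbeddingModel`. The snippet below is only a generic InfoNCE-with-in-batch-negatives sketch showing what the temperature scales; explicit hard negatives and the optional reward-model-style loss are omitted.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(query_emb, pos_doc_emb, temperature=0.02):
    """query_emb, pos_doc_emb: [batch, hidden]; row i of pos_doc_emb is the
    positive document for query i, every other row acts as a negative."""
    q = F.normalize(query_emb, dim=-1)
    d = F.normalize(pos_doc_emb, dim=-1)
    logits = q @ d.t() / temperature        # [batch, batch] scaled cosine similarities
    labels = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, labels)  # diagonal entries are the positives
```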
- transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - apply_rope_fusion: False - reward_model_loss: False # Set this to true to perform RLHF style reward model loss -log(sigmoid(accept_logit - reject_logit)) - - peft: - peft_scheme: "lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. 
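Each entry in `file_names` points to a JSONL file whose records follow the `{'input': ..., 'output': ...}` shape shown in the comments above. A minimal, hypothetical way to write such a file (the record below just reuses the example from that comment; real data comes from your own preprocessing):

```python
import json

records = [
    {
        "input": "John von Neumann\nVon Neumann made fundamental contributions .... "
                 "Q: What did the math of artificial viscosity do?",
        "output": "smoothed the shock transition without sacrificing basic physics",
    },
]
with open("my_dataset.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
```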
- min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: - - 1.0 - label_key: 'output' - add_eos: True - add_bos: False - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - validation_ds: - query_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - doc_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_embeddings_to_file: False - output_file_path_prefix: "validation_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
- num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml deleted file mode 100644 index 863b5fb475a0..000000000000 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml +++ /dev/null @@ -1,222 +0,0 @@ -name: megatron_gpt_peft_reranker_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: null - num_sanity_val_steps: 0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: False - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: True - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - temperature: 0.02 - num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only - use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only - post_process: False # should be False. - apply_rope_fusion: False - transformer_engine: True # required to be True for newer versions of Megatron-LM based models - mcore_gpt: True # required to be True for newer versions of Megatron-LM based models - use_flash_attention: True - precision: bf16 - - peft: - peft_scheme: "mlp_head,lora" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - # Instead of using the GPT LM Head, we can use a custom head for the reranking task - mlp_head_tuning: - out_features: 1 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers - - selective_tuning: - tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. - min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: - - 1.0 - label_key: 'output' - add_eos: True - add_bos: False - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ["validation"] # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_embeddings_to_file: False - output_file_path_prefix: "validation_rankings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - add_eos: ${model.data.train_ds.add_eos} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: True - output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. - index_mapping_dir: null # Path to a directory to write index mapping files. 
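The `mlp_head_tuning` block above (`out_features: 1`) swaps the GPT LM head for a small scoring head, so each (query, document) sequence yields a single relevance score. The real head is implemented inside `MegatronGPTRerankerModel`; the class below is only an illustrative stand-in, and the last-token pooling is an assumption.

```python
import torch.nn as nn

class RerankerScoringHead(nn.Module):
    """Map decoder hidden states of a (query, document) pair to one score."""

    def __init__(self, hidden_size, out_features=1):
        super().__init__()
        self.proj = nn.Linear(hidden_size, out_features)

    def forward(self, last_hidden_state):      # [batch, seq_len, hidden]
        pooled = last_hidden_state[:, -1]      # e.g. last-token pooling
        return self.proj(pooled).squeeze(-1)   # [batch] relevance scores
```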
- truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py deleted file mode 100644 index 7486b470425a..000000000000 --- a/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config") -def main(cfg) -> None: - if cfg.model.data.dataloader_type != "LDDL": - mp.set_start_method("spawn", force=True) - - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg) - - assert ( - model_cfg.micro_batch_size * cfg.trainer.devices * cfg.trainer.num_nodes == model_cfg.global_batch_size - ), "Gradiant accumulation is not supported for contrastive learning yet" - - OmegaConf.set_struct(model_cfg, True) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - - logging.info(f"Loading model from {cfg.restore_from_path}") - model = MegatronBertEmbeddingModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=model_cfg, - strict=True, - ) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py b/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py deleted file mode 100644 
index 9814129b837d..000000000000 --- a/examples/nlp/information_retrieval/megatron_bert_embedding_generate.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config") -def main(cfg) -> None: - if cfg.model.data.dataloader_type != "LDDL": - mp.set_start_method("spawn", force=True) - - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg) - - OmegaConf.set_struct(model_cfg, True) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - - logging.info(f"Loading model from {cfg.restore_from_path}") - model = MegatronBertEmbeddingModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=model_cfg, - strict=True, - ) - - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py deleted file mode 100644 index 9cb5cb5d3d19..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
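The BERT embedding finetuning script above asserts that `micro_batch_size * devices * num_nodes == global_batch_size`, i.e. gradient accumulation is not supported for the contrastive objective. A quick pre-launch sanity check of those settings (illustrative numbers; tensor and pipeline parallel sizes are assumed to be 1, so data-parallel size equals devices * num_nodes):

```python
def batch_settings_ok(global_batch_size, micro_batch_size, devices, num_nodes):
    data_parallel_size = devices * num_nodes   # assumes TP = PP = 1
    return global_batch_size == micro_batch_size * data_parallel_size

print(batch_settings_ok(32, 4, 8, 1))    # True: 4 per GPU * 8 GPUs = 32
print(batch_settings_ok(128, 4, 8, 1))   # False: would require gradient accumulation
```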
- -from collections.abc import MutableMapping - -import torch.multiprocessing as mp -from lightning.pytorch.loggers import WandbLogger -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: - items = [] - for k, v in d.items(): - new_key = parent_key + sep + k if parent_key else k - if isinstance(v, MutableMapping): - items.extend(flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTEmbeddingModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - if trainer.global_rank == 0: - for logger in trainer.loggers: - if isinstance(logger, WandbLogger): - fd = flatten_dict(dict(model_cfg), sep="/") - logger.experiment.config.update(fd) - model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint instead of randomly - # This is not the same as resume training because optimizer states are not restored. - logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - model.add_adapter(peft_cfg_cls(model_cfg)) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py deleted file mode 100644 index d66ddb339773..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
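For reference, the `flatten_dict` helper in the GPT embedding finetuning script above simply flattens the nested model config into separator-joined keys before it is pushed into the W&B run config:

```python
# using flatten_dict exactly as defined in the script above
cfg = {"model": {"peft": {"peft_scheme": "lora"}, "global_batch_size": 128}}
print(flatten_dict(cfg, sep="/"))
# {'model/peft/peft_scheme': 'lora', 'model/global_batch_size': 128}
```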
- - -import asyncio -import os -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -mp.set_start_method("spawn", force=True) - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, - daemon=True, - args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_generate_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - - if cfg.model.peft.restore_from_path: - model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - else: - model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) - - with open_dict(model_cfg): - model_cfg.post_process = False - - model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - if cfg.model.peft.restore_from_path: - model.load_adapters(cfg.model.peft.restore_from_path) - elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - checkpoint_path = os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - # checkpoint_path is a dir in case of distributed checkpointing - if not os.path.isdir(checkpoint_path): - # legacy checkpoint needs model parallel rank injection - checkpoint_path = inject_model_parallel_rank( - os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, 
cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - ) - model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) - else: - raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py deleted file mode 100644 index 5aad85646e3b..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import MutableMapping - -import torch.multiprocessing as mp -from lightning.pytorch.loggers import WandbLogger -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: - items = [] - for k, v in d.items(): - new_key = parent_key + sep + k if parent_key else k - if isinstance(v, MutableMapping): - items.extend(flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTRerankerModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - if trainer.global_rank == 0: - for logger in trainer.loggers: - if isinstance(logger, WandbLogger): - fd = flatten_dict(dict(model_cfg), sep="/") - logger.experiment.config.update(fd) - model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")] - peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint 
instead of randomly - # This is not the same as resume training because optimizer states are not restored. - logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - # model.add_adapter(peft_cfg_cls(model_cfg)) - model.add_adapter(peft_cfg_cls) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py deleted file mode 100644 index dea855963713..000000000000 --- a/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import asyncio -import os -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -mp.set_start_method("spawn", force=True) - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, - daemon=True, - args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - 
choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_generate_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - - if cfg.model.peft.restore_from_path: - model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - else: - model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) - - with open_dict(model_cfg): - model_cfg.post_process = False - - model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - if cfg.model.peft.restore_from_path: - model.load_adapters(cfg.model.peft.restore_from_path) - elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: - peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")] - peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst] - - checkpoint_path = os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - # checkpoint_path is a dir in case of distributed checkpointing - if not os.path.isdir(checkpoint_path): - # legacy checkpoint needs model parallel rank injection - checkpoint_path = inject_model_parallel_rank( - os.path.join( - cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name - ) - ) - model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls) - else: - raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml deleted file mode 100644 index df66111375cb..000000000000 --- a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 50 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: False - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? 
# /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "IntentSlot" # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml deleted file mode 100644 index c15c71e67c07..000000000000 --- a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: auto - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? 
# /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - validation_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - test_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "MultiLabelIntentSlot" # The name of your model - create_tensorboard_logger: False # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: False # Whether you want exp_manager to create a modelcheckpoint callback diff --git a/examples/nlp/intent_slot_classification/intent_slot_classification.py b/examples/nlp/intent_slot_classification/intent_slot_classification.py deleted file mode 100644 index 2025f48f330f..000000000000 --- a/examples/nlp/intent_slot_classification/intent_slot_classification.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import IntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = IntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. - logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = IntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on Assistant dataset - # for your own dataset change the examples appropriately - queries = [ - 'set alarm for seven thirty am', - 'lower volume by fifty percent', - 'what is my schedule for tomorrow', - ] - - pred_intents, pred_slots = eval_model.predict_from_examples(queries, cfg.model.test_ds) - - logging.info('The prediction results of some sample queries with the trained model:') - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intent: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py b/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py deleted file mode 100644 index 232aa7d4d230..000000000000 --- a/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022, NVIDIA 
CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sample command to run the script: - -python multi_label_intent_slot_classification.py \ - model.data_dir=/home/user/multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - -fast_dev_run=false will save checkpoints for the model -""" - - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import MultiLabelIntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="multi_label_intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = MultiLabelIntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. - logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = MultiLabelIntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # Optimize Threshold - eval_model.optimize_threshold(cfg.model.test_ds, 'dev') - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on ATIS dataset - # for your own dataset change the examples appropriately - queries = [ - 'i would like to find a flight from charlotte to las vegas that makes a stop in st. 
louis', - 'on april first i need a ticket from tacoma to san jose departing before 7 am', - 'how much is the limousine service in boston', - ] - - # We use the optimized threshold for predictions - pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, cfg.model.test_ds) - logging.info('The prediction results of some sample queries with the trained model:') - - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intents: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/spellchecking_asr_customization/README.md b/examples/nlp/spellchecking_asr_customization/README.md deleted file mode 100644 index 9d2063eff181..000000000000 --- a/examples/nlp/spellchecking_asr_customization/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# SpellMapper - spellchecking model for ASR Customization -Paper: https://arxiv.org/abs/2306.02317 -This model was partly inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf. -The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. -Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). - -As initial data we use about 5 mln entities from [YAGO corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/yago/downloads/). These entities are short phrases from Wikipedia headings. -In order to get misspelled predictions we feed these data to TTS model and then to ASR model. -Having a "parallel" corpus of "correct + misspelled" phrases, we use statistical machine translation techniques to create a dictionary of possible ngram mappings with their respective frequencies. -We create an auxiliary algorithm that takes as input a sentence (ASR hypothesis) and a large custom dictionary (e.g. 5000 phrases) and selects top 10 candidate phrases that are probably contained in this sentence in a misspelled way. -The task of our final neural model is to predict which fragments in the ASR hypothesis should be replaced by which of top-10 candidate phrases if any. - -The pipeline consists of multiple steps: - -1. Download or generate training data. - See `https://github.com/bene-ges/nemo_compatible/tree/main/scripts/nlp/en_spellmapper/dataset_preparation` - -2. [Optional] Convert training dataset to tarred files. - `convert_dataset_to_tarred.sh` - -3. Train spellchecking model. - `run_training.sh` - or - `run_training_tarred.sh` - -4. Run evaluation. - - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - -5. Run inference. 
- `python run_infer.sh` diff --git a/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py b/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py deleted file mode 100644 index c2f514f3e67e..000000000000 --- a/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script converts checkpoint .ckpt to .nemo file. - -This script uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. -""" - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - SpellcheckingAsrCustomizationModel.load_from_checkpoint(cfg.checkpoint_path).save_to(cfg.target_nemo_path) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml b/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml deleted file mode 100644 index f8dca7b974e5..000000000000 --- a/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: &name spellchecking -lang: ??? # e.g. 'ru', 'en' - -# Pretrained Nemo Models -pretrained_model: null - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 # the number of training epochs - enable_checkpointing: false # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -model: - do_training: true - label_map: ??? # path/.../label_map.txt - semiotic_classes: ??? 
# path/.../semiotic_classes.txt - max_sequence_len: 128 - lang: ${lang} - hidden_size: 768 - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.1 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_ratio: 0.1 - last_epoch: -1 - - language_model: - pretrained_model_name: bert-base-uncased # For ru, try DeepPavlov/rubert-base-cased | For de or multilingual, try bert-base-multilingual-cased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -tokenizer: - tokenizer_name: ${model.transformer} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -# Data -data: - train_ds: - data_path: ??? # provide the full path to the file - batch_size: 8 - shuffle: true - num_workers: 3 - pin_memory: false - drop_last: false - - validation_ds: - data_path: ??? # provide the full path to the file. - batch_size: 8 - shuffle: false - num_workers: 3 - pin_memory: false - drop_last: false - - -# Inference -inference: - from_file: null # Path to the raw text, no labels required. Each sentence on a separate line - out_file: null # Path to the output file - batch_size: 16 # batch size for inference.from_file diff --git a/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh b/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh deleted file mode 100644 index d4265eb4beb6..000000000000 --- a/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH="data_folder" - -## data_folder_example -## ├── tarred_data -## | └── (output) -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## ├── 1.tsv -## ├── ... 
-## └── 200.tsv - -## Each of {1-200}.tsv input files are 110'000 examples subsets of all.tsv (except for validation part), -## generated by https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh -## Note that in this example we use 110'000 as input and only pack 100'000 of them to tar file. -## This is because some input examples, e.g. too long, can be skipped during preprocessing, and we want all tar files to contain fixed equal number of examples. - -for part in {1..200} -do - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang="en" \ - data.train_ds.data_path=${DATA_PATH}/${part}.tsv \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - model.max_sequence_len=256 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - +output_tar_file=${DATA_PATH}/tarred_data/part${part}.tar \ - +take_first_n_lines=100000 -done diff --git a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py deleted file mode 100644 index 68c55ff51a4f..000000000000 --- a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create an index of custom vocabulary and save it to file. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import get_index, load_ngram_mappings - -parser = ArgumentParser(description="Create an index of custom vocabulary and save it to file") - -parser.add_argument( - "--input_name", required=True, type=str, help="Path to input file with custom vocabulary (plain text)" -) -parser.add_argument( - "--ngram_mappings", required=True, type=str, help="Path to input file with n-gram mapping vocabulary" -) -parser.add_argument("--output_name", required=True, type=str, help="Path to output file with custom vocabulary index") -parser.add_argument("--min_log_prob", default=-4.0, type=float, help="Threshold on log probability") -parser.add_argument( - "--max_phrases_per_ngram", - default=500, - type=int, - help="Threshold on number of phrases that can be stored for one n-gram key in index. 
Keys with more phrases are discarded.", -) -parser.add_argument( - "--max_misspelled_freq", default=125000, type=int, help="Threshold on maximum frequency of misspelled n-gram" -) - -args = parser.parse_args() - -# Load custom vocabulary -custom_phrases = set() -with open(args.input_name, "r", encoding="utf-8") as f: - for line in f: - phrase = line.strip() - custom_phrases.add(" ".join(list(phrase.replace(" ", "_")))) -print("Size of customization vocabulary:", len(custom_phrases)) - -# Load n-gram mappings vocabulary -ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=args.max_misspelled_freq) - -# Generate index of custom phrases -phrases, ngram2phrases = get_index( - custom_phrases, - ngram_mapping_vocab, - ban_ngram, - min_log_prob=args.min_log_prob, - max_phrases_per_ngram=args.max_phrases_per_ngram, -) - -# Save index to file -with open(args.output_name, "w", encoding="utf-8") as out: - for ngram in ngram2phrases: - for phrase_id, begin, size, logprob in ngram2phrases[ngram]: - phrase = phrases[phrase_id] - out.write(ngram + "\t" + phrase + "\t" + str(begin) + "\t" + str(size) + "\t" + str(logprob) + "\n") diff --git a/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py b/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py deleted file mode 100644 index d0bdc2c9bd30..000000000000 --- a/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create a tarred dataset for SpellcheckingAsrCustomizationModel. - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: -1. Obtain a processed dataset -2. 
Run: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang=${LANG} \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - model.language_model.pretrained_model_name=${LANGUAGE_MODEL} \ - model.label_map=${DATA_PATH}/label_map.txt \ - +output_tar_file=tarred/part1.tar \ - +take_first_n_lines=100000 - -""" -import pickle -import tarfile -from io import BytesIO - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - logging.info("Start creating tar file from " + cfg.data.train_ds.data_path + " ...") - _, model = instantiate_model_and_trainer( - cfg, MODEL, True - ) # instantiate model like for training because we may not have pretrained model - dataset = model._train_dl.dataset - archive = tarfile.open(cfg.output_tar_file, mode="w") - max_lines = int(cfg.take_first_n_lines) - for i in range(len(dataset)): - if i >= max_lines: - logging.info("Reached " + str(max_lines) + " examples") - break - ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) = dataset[i] - - # do not store masks as they are just arrays of 1 - content = { - "input_ids": input_ids, - "input_mask": input_mask, - "segment_ids": segment_ids, - "input_ids_for_subwords": input_ids_for_subwords, - "input_mask_for_subwords": input_mask_for_subwords, - "segment_ids_for_subwords": segment_ids_for_subwords, - "character_pos_to_subword_pos": character_pos_to_subword_pos, - "labels_mask": labels_mask, - "labels": labels, - "spans": spans, - } - b = BytesIO() - pickle.dump(content, b) - b.seek(0) - tarinfo = tarfile.TarInfo(name="example_" + str(i) + ".pkl") - tarinfo.size = b.getbuffer().nbytes - archive.addfile(tarinfo=tarinfo, fileobj=b) - - archive.close() - logging.info("Tar file " + cfg.output_tar_file + " created!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/spellchecking_asr_customization/helpers.py b/examples/nlp/spellchecking_asr_customization/helpers.py deleted file mode 100644 index 8e3957d34cc1..000000000000 --- a/examples/nlp/spellchecking_asr_customization/helpers.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
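
Each tar member written by `create_tarred_dataset.py` above is a pickled dict of arrays stored under the name `example_<i>.pkl`. When debugging a shard it can help to read a few members back; the sketch below is only an illustration under that assumption, not the loader used during tarred training, and `part1.tar` is a placeholder path.

```python
import pickle
import tarfile

# "part1.tar" is a placeholder; point it at a shard written by the script above.
with tarfile.open("part1.tar", mode="r") as archive:
    for member in archive.getmembers()[:3]:  # peek at the first few examples
        example = pickle.load(archive.extractfile(member))
        # Every stored value is an array-like produced by the dataset above.
        print(member.name, {key: getattr(value, "shape", None) for key, value in example.items()})
```
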
- - -import os -from typing import Tuple - -import lightning.pytorch as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.utils import logging - -__all__ = ["MODEL", "MODEL_NAMES", "instantiate_model_and_trainer"] - -MODEL = "spellchecking" -MODEL_NAMES = [MODEL] - - -def instantiate_model_and_trainer( - cfg: DictConfig, model_name: str, do_training: bool -) -> Tuple[pl.Trainer, SpellcheckingAsrCustomizationModel]: - """Function for instantiating a model and a trainer - Args: - cfg: The config used to instantiate the model and the trainer. - model_name: A str indicates the model direction, currently only 'itn'. - do_training: A boolean flag indicates whether the model will be trained or evaluated. - - Returns: - trainer: A PyTorch Lightning trainer - model: A SpellcheckingAsrCustomizationModel - """ - - if model_name not in MODEL_NAMES: - raise ValueError(f"{model_name} is unknown model type") - - # Get configs for the corresponding models - trainer_cfg = cfg.get("trainer") - model_cfg = cfg.get("model") - pretrained_cfg = cfg.get("pretrained_model", None) - trainer = pl.Trainer(**trainer_cfg) - if not pretrained_cfg: - logging.info(f"Initializing {model_name} model") - if model_name == MODEL: - model = SpellcheckingAsrCustomizationModel(model_cfg, trainer=trainer) - else: - raise ValueError(f"{model_name} is unknown model type") - elif os.path.exists(pretrained_cfg): - logging.info(f"Restoring pretrained {model_name} model from {pretrained_cfg}") - save_restore_connector = NLPSaveRestoreConnector() - model = SpellcheckingAsrCustomizationModel.restore_from( - pretrained_cfg, save_restore_connector=save_restore_connector - ) - else: - logging.info(f"Loading pretrained model {pretrained_cfg}") - if model_name == MODEL: - if pretrained_cfg not in SpellcheckingAsrCustomizationModel.get_available_model_names(): - raise ( - ValueError( - f"{pretrained_cfg} not in the list of available Tagger models." - f"Select from {SpellcheckingAsrCustomizationModel.list_available_models()}" - ) - ) - model = SpellcheckingAsrCustomizationModel.from_pretrained(pretrained_cfg) - else: - raise ValueError(f"{model_name} is unknown model type") - - # Setup train and validation data - if do_training: - model.setup_training_data(train_data_config=cfg.data.train_ds) - model.setup_validation_data(val_data_config=cfg.data.validation_ds) - - logging.info(f"Model {model_name} -- Device {model.device}") - return trainer, model diff --git a/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py b/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py deleted file mode 100644 index 871d5e5c0c0c..000000000000 --- a/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to postprocess SpellMapper results and generate an updated nemo ASR manifest. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - update_manifest_with_spellmapper_corrections, -) - -parser = ArgumentParser(description="Postprocess SpellMapper results and generate an updated nemo ASR manifest") - -parser.add_argument("--input_manifest", required=True, type=str, help="Path to input nemo ASR manifest") -parser.add_argument( - "--field_name", default="pred_text", type=str, help="Name of json field with original ASR hypothesis text" -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to input file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--spellmapper_results", required=True, type=str, help="Path to input file with SpellMapper inference results" -) -parser.add_argument("--output_manifest", required=True, type=str, help="Path to output nemo ASR manifest") -parser.add_argument("--min_prob", default=0.5, type=float, help="Threshold on replacement probability") -parser.add_argument( - "--use_dp", - action="store_true", - help="Whether to use additional replacement filtering by using dynamic programming", -) -parser.add_argument( - "--replace_hyphen_to_space", - action="store_true", - help="Whether to use space instead of hyphen in replaced fragments", -) -parser.add_argument( - "--ngram_mappings", type=str, required=True, help="File with ngram mappings, only needed if use_dp=true" -) -parser.add_argument( - "--min_dp_score_per_symbol", - default=-1.5, - type=float, - help="Minimum dynamic programming sum score averaged by hypothesis length", -) - -args = parser.parse_args() - -update_manifest_with_spellmapper_corrections( - input_manifest_name=args.input_manifest, - short2full_name=args.short2full_name, - output_manifest_name=args.output_manifest, - spellmapper_results_name=args.spellmapper_results, - min_prob=args.min_prob, - replace_hyphen_to_space=args.replace_hyphen_to_space, - field_name=args.field_name, - use_dp=args.use_dp, - ngram_mappings=args.ngram_mappings, - min_dp_score_per_symbol=args.min_dp_score_per_symbol, -) - -print("Resulting manifest saved to: ", args.output_manifest) diff --git a/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py b/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py deleted file mode 100644 index 6fd5e524390a..000000000000 --- a/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
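
The post-processing step above rewrites the ASR hypotheses of a NeMo manifest (JSON lines, one object per utterance) in the field given by `--field_name`, `pred_text` by default. A minimal sketch for spotting what changed is shown below; it assumes the input and output manifests stay aligned line by line, and the file names are placeholders.

```python
import json

FIELD = "pred_text"  # must match the --field_name used above

# Placeholder paths: the original manifest and the corrected one produced above.
with open("manifest.json", encoding="utf-8") as before_f, open("out_manifest.json", encoding="utf-8") as after_f:
    for before_line, after_line in zip(before_f, after_f):
        before = json.loads(before_line)[FIELD]
        after = json.loads(after_line)[FIELD]
        if before != after:
            print("-", before)
            print("+", after)
```
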
- - -""" -This script contains an example on how to prepare input for SpellMapper inference from a nemo ASR manifest. -It splits sentences to shorter fragments, runs candidate retrieval and generates input in the required format. -It produces two output files: - 1. File with correspondence between sentence fragments and full sentences. - 2. File that will serve as input for SpellMapper inference. - -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - extract_and_split_text_from_manifest, - get_candidates, - load_index, -) - -parser = ArgumentParser(description="Prepare input for SpellMapper inference from a nemo ASR manifest") -parser.add_argument("--manifest", required=True, type=str, help="Path to input manifest file") -parser.add_argument( - "--custom_vocab_index", required=True, type=str, help="Path to input file with custom vocabulary index" -) -parser.add_argument( - "--big_sample", - required=True, - type=str, - help="Path to input file with big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval", -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to output file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--output_name", - required=True, - type=str, - help="Path to output file that will serve as input for SpellMapper inference", -) -parser.add_argument("--field_name", default="pred_text", type=str, help="Name of json field with ASR hypothesis text") -parser.add_argument("--len_in_words", default=16, type=int, help="Maximum fragment length in words") -parser.add_argument( - "--step_in_words", - default=8, - type=int, - help="Step in words for moving to next fragment. If less than len_in_words, fragments will intersect", -) - -args = parser.parse_args() - -# Split ASR hypotheses to shorter fragments, because SpellMapper can't handle arbitrarily long sequences. -# The correspondence between short and original fragments is saved to a file and will be used at post-processing. 
-extract_and_split_text_from_manifest( - input_name=args.manifest, - output_name=args.short2full_name, - field_name=args.field_name, - len_in_words=args.len_in_words, - step_in_words=args.step_in_words, -) - -# Load index of custom vocabulary from file -phrases, ngram2phrases = load_index(args.custom_vocab_index) - -# Load big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval -big_sample_of_phrases = set() -with open(args.big_sample, "r", encoding="utf-8") as f: - for line in f: - phrase, freq = line.strip().split("\t") - if int(freq) > 50: # do not want to use frequent phrases as dummy candidates - continue - if len(phrase) < 6 or len(phrase) > 15: # do not want to use too short or too long phrases as dummy candidates - continue - big_sample_of_phrases.add(phrase) - -big_sample_of_phrases = list(big_sample_of_phrases) - -# Generate input for SpellMapper inference -out = open(args.output_name, "w", encoding="utf-8") -with open(args.short2full_name, "r", encoding="utf-8") as f: - for line in f: - short_sent, _ = line.strip().split("\t") - sent = "_".join(short_sent.split()) - letters = list(sent) - candidates = get_candidates(ngram2phrases, phrases, letters, big_sample_of_phrases) - if len(candidates) == 0: - continue - if len(candidates) != 10: - raise ValueError("expect 10 candidates, got: ", len(candidates)) - - # We add two columns with targets and span_info. - # They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample. - targets = [] - span_info = [] - for idx, c in enumerate(candidates): - if c[1] == -1: - continue - targets.append(str(idx + 1)) # targets are 1-based - start = c[1] - # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation) - end = min(c[1] + c[2], len(letters)) - span_info.append("CUSTOM " + str(start) + " " + str(end)) - out.write( - " ".join(letters) - + "\t" - + ";".join([x[0] for x in candidates]) - + "\t" - + " ".join(targets) - + "\t" - + ";".join(span_info) - + "\n" - ) -out.close() diff --git a/examples/nlp/spellchecking_asr_customization/run_infer.sh b/examples/nlp/spellchecking_asr_customization/run_infer.sh deleted file mode 100644 index b4bbdc4da375..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_infer.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
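
`prepare_input_from_manifest.py` above relies on two conventions: long hypotheses are split into overlapping fragments of `--len_in_words` words with a stride of `--step_in_words`, and each fragment (like each custom phrase) is encoded as space-separated characters with `_` standing in for word boundaries. The sketch below is a simplified, self-contained stand-in for both conventions; the real logic lives in `nemo.collections.nlp.data.spellchecking_asr_customization.utils`, and the sample sentence is invented.

```python
def split_into_fragments(text: str, len_in_words: int = 16, step_in_words: int = 8):
    """Overlapping sliding window over words (fragments overlap when step < length)."""
    words = text.split()
    if len(words) <= len_in_words:
        return [" ".join(words)]
    return [
        " ".join(words[start : start + len_in_words])
        for start in range(0, len(words) - step_in_words, step_in_words)
    ]


def to_spellmapper_characters(fragment: str) -> str:
    """Join words with '_' and separate every character with a space."""
    return " ".join(list(fragment.replace(" ", "_")))


# Invented sentence, long enough to produce two overlapping fragments.
sentence = "patient was diagnosed with hepatic cirrhosis of the liver and was given a new treatment plan for the condition"
for fragment in split_into_fragments(sentence, len_in_words=16, step_in_words=8):
    print(to_spellmapper_characters(fragment))
```
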
- -## RUN INFERENCE ON NEMO MANIFEST AND CUSTOM VOCABULARY - -## Path to NeMo repository -NEMO_PATH=NeMo - -## Download model repo from Hugging Face (if clone doesn't work, run "git lfs install" and try again) -git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en -## Download repo with test data -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_evaluation - -## Files in model repo -PRETRAINED_MODEL=spellmapper_asr_customization_en/training_10m_5ep.nemo -NGRAM_MAPPINGS=spellmapper_asr_customization_en/replacement_vocab_filt.txt -BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt - -## Override these two files if you want to test on your own data -## File with input nemo ASR manifest -INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json -## File containing custom words and phrases (plain text) -CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.txt - -## Other files will be created -## File with index of custom vocabulary -INDEX="index.txt" -## File with short fragments and corresponding original sentences -SHORT2FULL="short2full.txt" -## File with input for SpellMapper inference -SPELLMAPPER_INPUT="spellmapper_input.txt" -## File with output of SpellMapper inference -SPELLMAPPER_OUTPUT="spellmapper_output.txt" -## File with output nemo ASR manifest -OUTPUT_MANIFEST="out_manifest.json" - - -# Create index of custom vocabulary -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py \ - --input_name ${CUSTOM_VOCAB} \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --output_name ${INDEX} \ - --min_log_prob -4.0 \ - --max_phrases_per_ngram 600 - -# Prepare input for SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \ - --manifest ${INPUT_MANIFEST} \ - --custom_vocab_index ${INDEX} \ - --big_sample ${BIG_SAMPLE} \ - --short2full_name ${SHORT2FULL} \ - --output_name ${SPELLMAPPER_INPUT} \ - --field_name "pred_text" \ - --len_in_words 16 \ - --step_in_words 8 - -# Run SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_MODEL} \ - model.max_sequence_len=512 \ - inference.from_file=${SPELLMAPPER_INPUT} \ - inference.out_file=${SPELLMAPPER_OUTPUT} \ - inference.batch_size=16 \ - lang=en - -# Postprocess and create output corrected manifest -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \ - --input_manifest ${INPUT_MANIFEST} \ - --short2full_name ${SHORT2FULL} \ - --output_manifest ${OUTPUT_MANIFEST} \ - --spellmapper_result ${SPELLMAPPER_OUTPUT} \ - --replace_hyphen_to_space \ - --field_name "pred_text" \ - --use_dp \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --min_dp_score_per_symbol -1.5 - -# Check WER of initial manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${INPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True - -# Check WER of corrected manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${OUTPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True diff --git a/examples/nlp/spellchecking_asr_customization/run_training.sh b/examples/nlp/spellchecking_asr_customization/run_training.sh deleted file mode 100644 index 85dddbb2a038..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_training.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## TRAIN WITH NON-TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -## Download repo with training data (very small example) -## If clone doesn't work, run "git lfs install" and try again -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_train_micro - -DATA_PATH=spellmapper_en_train_micro - -## Example of all files needed to run training with non-tarred data: -## spellmapper_en_train_micro -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## └── train.tsv - -## To generate files config.json, label_map.txt, semiotic_classes.txt - run generate_configs.sh -## Files "train.tsv" and "test.tsv" contain training examples. -## For data preparation see https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh - -## Note that training with non-tarred data only works on single gpu. It makes sense if you use 1-2 million examples or less. - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=8 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.lr=3e-5 \ - trainer.devices=[1] \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh b/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh deleted file mode 100644 index 655c3e23e610..000000000000 --- a/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## TRAIN WITH TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH=data_folder - -## data_folder_example -## ├── train_tarred -## | ├── part1.tar -## | ├── ... 
-## | └── part200.tar -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## └── test.tsv -## To generate files config.json, label_map.txt, semiotic_classes.txt, run generate_configs.sh -## To prepare data, see ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/build_training_data.sh -## To convert data to tarred format, split all.tsv to pieces of 110'000 examples (except for validation part) and use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/convert_data_to_tarred.sh -## To run training with tarred data, use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh - -## ATTENTION: How to calculate model.optim.sched.max_steps: -## Suppose, you have 2'000'000 training examples, and want to train for 5 epochs on 4 gpus with batch size 32. -## 5 (epochs) * 32 (bs) * 4 (gpus) -## 1 step consumes 128 examples (32(bs) * 4(gpus)) -## 1 epoch makes 2000000/128=15625 steps (updates) -## 5 epochs make 5*15625=78125 steps - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train_tarred/part_OP_1..100_CL_.tar \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=16 \ - +data.train_ds.use_tarred_dataset=true \ - data.train_ds.shuffle=false \ - data.validation_ds.batch_size=16 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.sched.name=CosineAnnealing \ - +model.optim.sched.max_steps=195313 \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py deleted file mode 100644 index 593264f14a5d..000000000000 --- a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to run inference with the SpellcheckingAsrCustomizationModel. - -An input line should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 1-based ids of non-dummy candidates - 4. 
approximate start/end coordinates of non-dummy candidates (correspond to ids in third column) - -Example input (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 1 2 6 7 8 9 10 - CUSTOM 6 23;CUSTOM 4 10;CUSTOM 4 15;CUSTOM 56 62;CUSTOM 5 19;CUSTOM 28 31;CUSTOM 39 48 - -Each line in SpellMapper output is tab-separated and consists of 4 columns: - 1. ASR-hypothesis (same as in input) - 2. 10 candidates separated with semicolon (same as in input) - 3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability) - 4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes) - -Example output (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 56 62 7 0.99998;4 20 8 0.95181;12 20 8 0.44829;4 17 8 0.99464;12 17 8 0.97645 - 8 8 8 0 8 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 - - -USAGE Example: -1. Train a model, or use a pretrained checkpoint. -2. Run on a single file: - python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - inference.from_file=input.txt \ - inference.out_file=output.txt \ - inference.batch_size=16 \ - lang=en -or on multiple files: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - +inference.from_filelist=filelist.txt \ - +inference.output_folder=output_folder \ - inference.batch_size=16 \ - lang=en - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. 
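
For inspecting SpellMapper predictions outside the NeMo pipeline, a small parser over the output format documented above can be handy. The sketch below only assumes the tab- and semicolon-separated layout described here; filtering the parsed tuples by probability mirrors the `--min_prob` threshold applied later at post-processing.

```python
def parse_output_line(line: str):
    """Parse one tab-separated SpellMapper output line into its four columns."""
    hypothesis, candidates, fragment_predictions, letter_predictions = line.rstrip("\n").split("\t")
    replacements = []
    if fragment_predictions:
        for prediction in fragment_predictions.split(";"):
            start, end, candidate_id, probability = prediction.split()
            replacements.append((int(start), int(end), int(candidate_id), float(probability)))
    return hypothesis, candidates.split(";"), replacements, letter_predictions.split()


# Synthetic line (candidate list truncated for brevity); real lines carry 10 candidates.
sample = "t h e _ t h o r a x\tt h o r a x;a v f\t4 10 1 0.98\t0 0 0 0 1 1 1 1 1 1"
print(parse_output_line(sample)[2])  # [(4, 10, 1, 0.98)]
```
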
-""" - - -import os - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - if cfg.pretrained_model is None: - raise ValueError("A pre-trained model should be provided.") - _, model = instantiate_model_and_trainer(cfg, MODEL, False) - - if cfg.model.max_sequence_len != model.max_sequence_len: - model.max_sequence_len = cfg.model.max_sequence_len - model.builder._max_seq_length = cfg.model.max_sequence_len - input_filenames = [] - output_filenames = [] - - if "from_filelist" in cfg.inference and "output_folder" in cfg.inference: - filelist_file = cfg.inference.from_filelist - output_folder = cfg.inference.output_folder - with open(filelist_file, "r", encoding="utf-8") as f: - for line in f: - path = line.strip() - input_filenames.append(path) - folder, name = os.path.split(path) - output_filenames.append(os.path.join(output_folder, name)) - else: - text_file = cfg.inference.from_file - logging.info(f"Running inference on {text_file}...") - if not os.path.exists(text_file): - raise ValueError(f"{text_file} not found.") - input_filenames.append(text_file) - output_filenames.append(cfg.inference.out_file) - - dataloader_cfg = { - "batch_size": cfg.inference.get("batch_size", 8), - "num_workers": cfg.inference.get("num_workers", 4), - "pin_memory": cfg.inference.get("num_workers", False), - } - for input_filename, output_filename in zip(input_filenames, output_filenames): - if not os.path.exists(input_filename): - logging.info(f"Skip non-existing {input_filename}.") - continue - model.infer(dataloader_cfg, input_filename, output_filename) - logging.info(f"Predictions saved to {output_filename}.") - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py deleted file mode 100644 index ac50b4121f15..000000000000 --- a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to train SpellMapper (SpellcheckingAsrCustomizationModel). -It uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. 
- -USAGE Example: - See `examples/nlp/spellchecking_asr_customization/run_training.sh` for training on non-tarred data. - and - `examples/nlp/spellchecking_asr_customization/run_training_tarred.sh` for training on tarred data. - -One (non-tarred) training example should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 1-based ids of correct candidates, or 0 if none - 4. start/end coordinates of correct candidates (correspond to ids in third column) -Example (in one line): - a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o - d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y - 1 3 - CUSTOM 12 23;CUSTOM 28 41 -""" - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - # Train the model - if cfg.model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Start training...') - trainer, model = instantiate_model_and_trainer(cfg, MODEL, True) - spellchecking_exp_manager = cfg.get('exp_manager', None) - exp_manager(trainer, spellchecking_exp_manager) - trainer.fit(model) - logging.info('Training finished!') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md deleted file mode 100644 index 808ed2856fb2..000000000000 --- a/examples/nlp/token_classification/README.md +++ /dev/null @@ -1,2 +0,0 @@ -> [!IMPORTANT] -> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml deleted file mode 100644 index cc374f538c93..000000000000 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Punctuation and capitalization model with pretrained BERT-like models - -pretrained_model: null # pretrained Punctuation and Capitalization model from list_available_models(), for example: -# punctuation_en_bert or punctuation_en_distilbert -# or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - class_labels: - punct_labels_file: punct_label_ids.csv - capit_labels_file: capit_label_ids.csv - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. - use_tarred_dataset: false - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` are stored. - ds_item: ??? - - text_file: text_train.txt - labels_file: labels_train.txt - # Permutes batches every epoch - shuffle: true - num_samples: -1 - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. 
- tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - ds_item: ??? # expected format: [PATH_TO_DEV1,PATH_TO_DEV2] (Note no space between the paths and square brackets) - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml deleted file mode 100644 index e727d22aca54..000000000000 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Punctuation and capitalization lexical audio model with pretrained BERT-like models and Encoder-Decoder-like models. -pretrained_model: null # pretrained Punctuation and Capitalization Lexical Audio model from list_available_models(), for example: -# -# or your_model.nemo -trainer: - devices: -1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, - # LR schedulers, apex, etc. - log_every_n_steps: 50 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization_Lexical_Audio # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - save_best_model: true - resume_from_checkpoint: null - -model: - audio_encoder: - pretrained_model: stt_en_conformer_ctc_medium # You can choose any pretrained ASR model from list_available_models() of EncDecCTCModel. - freeze: - is_enabled: false # If set to True weights of audio encoder will not be updated during training. - d_model: 256 # Input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - d_ff: 1024 # Hidden dimension of PositionwiseFeedForward - num_layers: 4 # Number of additional Conformer layers - adapter: - enable: false # If set to True will enable adapters for audio encoder. - config: - # For more details see `nemo.collections.common.parts.LinearAdapter` class - in_features: -1 # Will be replaced with size of audio encoder - dim: 128 # Hidden dimension of the feed forward network. - activation: 'swish' # Str name for an activation function. - fusion: - num_layers: 4 # Number of layers to use in fusion - num_attention_heads: 4 # Number of attention heads to use in fusion - inner_size: 2048 # Fusion inner size - - class_labels: - punct_labels_file: punct_label_ids.txt - capit_labels_file: capit_label_ids.txt - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. 
- use_tarred_dataset: false - - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` and `audio_file` are stored. - ds_item: ??? - text_file: text_train.txt - labels_file: labels_train.txt - audio_file: audio_train.txt - - use_audio: true # Has to be set to true to use it for lexical audio model. - use_bucketing: true # If set to true batches will be sorted by length of audios and packed in batches limited by `tokens_in_batch`. Otherwise, provide `batch_size` parameter. - # If set to true audios will be loaded to memory during __init__ call of `BertPunctuationCapitalizationDataset`, consumes more RAM. - # Otherwise, audios will be loaded during `collate_fn` call of `BertPunctuationCapitalizationDataset`. - preload_audios: true - - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 2048 - max_seq_length: 512 - - sample_rate: 16000 # Target sample rate of audios can be used for downsampling or upsamling. - num_workers: 0 - - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. - tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. 
- tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null \ No newline at end of file diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml deleted file mode 100644 index 05024c781dab..000000000000 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
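The `tokens_in_batch` comments in the train_ds sections above describe length-based bucketing rather than a fixed batch size. The sketch below is a simplified illustration of that packing idea (assumed behaviour, not the NeMo dataloader itself): sort examples by length, then grow a batch until adding one more example would push max_length_in_batch * batch_size over the token budget.

    def pack_by_token_budget(lengths, tokens_in_batch):
        # Sort example indices by sequence length so similarly sized examples share a
        # batch and little padding is needed.
        order = sorted(range(len(lengths)), key=lambda i: lengths[i])
        batches, current = [], []
        for i in order:
            longest = lengths[i]  # lengths are visited in ascending order
            if current and longest * (len(current) + 1) > tokens_in_batch:
                batches.append(current)
                current = []
            current.append(i)
        if current:
            batches.append(current)
        return batches

    # Longer sequences -> fewer examples per batch, as the config comments say.
    print(pack_by_token_budget([5, 3, 8, 8, 2, 7], tokens_in_batch=16))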
- -# Token Classification tasks (for example, Named Entity Recognition) with pretrained BERT-like models - -pretrained_model: null # pretrained TokenClassification model from list_available_models() or path to a .nemo file, -# for example: ner_en_bert or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: token_classification_model # The name of your model - create_tensorboard_logger: true # Whether you want exp_manager to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - label_ids: null # will be filled during training - class_labels: - class_labels_file: label_ids.csv # will be generated during training and saved in .nemo file - dataset: - data_dir: ??? # /path/to/data - class_balancing: null # choose from [null, weighted_loss]. Weighted_loss enables the weighted class balancing of the loss, may be used for handling unbalanced classes - max_seq_length: 128 - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: false - use_cache: false - # shared among dataloaders - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - text_file: text_train.txt - labels_file: labels_train.txt - shuffle: true - num_samples: -1 - batch_size: 64 - - validation_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - test_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - - head: - num_fc_layers: 2 - fc_dropout: 0.5 - activation: 'relu' - use_transformer_init: True - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py deleted file mode 100644 index d74c2d8bc19a..000000000000 --- a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
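The `text_file`/`labels_file` pairs referenced by the token classification config above use one whitespace-tokenized sentence per line in the text file and one space-separated label per word in the labels file. A toy example of producing such a pair; the sentence and tag set are illustrative only.

    # Write a one-sentence toy dataset in the expected two-file layout.
    text_lines = ["jim bought shares of acme corp in 2006"]
    label_lines = ["B-PER O O O B-ORG I-ORG O O"]  # one label per word, illustrative tags

    with open("text_train.txt", "w") as text_f, open("labels_train.txt", "w") as labels_f:
        text_f.write("\n".join(text_lines) + "\n")
        labels_f.write("\n".join(label_lines) + "\n")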
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import multiprocessing as mp -from pathlib import Path - -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset import ( - DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME, - DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME, - METADATA_CAPIT_LABEL_VOCAB_KEY, - METADATA_PUNCT_LABEL_VOCAB_KEY, - build_label_ids_from_list_of_labels, - check_labels_for_being_unique_before_building_label_ids, - check_tar_file_prefix, - create_tarred_dataset, -) - - -""" -A tarred dataset allows to train on large amounts without storing it all into memory simultaneously. In case of -punctuation and capitalization model, tarred dataset is a directory which contains metadata file, tar files with -batches, punct_label_vocab.csv and capit_label_vocab.csv files. - -A metadata file is a JSON file with 4 fields: 'num_batches', 'tar_files', 'punct_label_vocab_file', -'capit_label_vocab_file'. 'num_batches' (int) is a total number of batches in tarred dataset. 'tar_files' is a list of -paths to tar files relative to directory containing the metadata file. 'punct_label_vocab_file' and -'capit_label_vocab_file' are paths to .csv files containing all unique punctuation and capitalization labels. Each -label in these files is written in a separate line. The first labels in both files are equal and serve for padding and -as neutral labels. - -Every tar file contains objects written using `webdataset.TarWriter`. Each object is a dictionary with two items: -'__key__' and 'batch.pyd'. '__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains -'input_ids', 'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, -'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are arrays with -ids of labels. Metadata file should be passed to constructor of -`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of -the class will handle iteration and constructing masks and token types for BERT model. - -Example of usage: - -python create_punctuation_capitalization_tarred_dataset.py \ - --text \ - --labels \ - --output_dir \ - --lines_per_dataset_fragment 10000 \ - --tokens_in_batch 8000 \ - --num_batches_per_tarfile 5 \ - --tokenizer_name char \ - --vocab_file -""" - - -def get_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"A tarred dataset allows to train on large amounts without storing it all into memory " - f"simultaneously. In case of punctuation and capitalization model, tarred dataset is a directory which " - f"contains metadata file, tar files with batches, {DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME} and " - f"{DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME} files. 
A metadata file is a JSON file with 4 fields: 'num_batches', " - f"'tar_files', '{METADATA_PUNCT_LABEL_VOCAB_KEY}', '{METADATA_CAPIT_LABEL_VOCAB_KEY}'. 'num_batches' (int) is " - f"a total number of batches in tarred dataset. 'tar_files' is a list of paths to tar files relative " - f"to directory containing the metadata file. '{METADATA_PUNCT_LABEL_VOCAB_KEY}' and " - f"'{METADATA_CAPIT_LABEL_VOCAB_KEY}' are paths to .csv files containing all unique punctuation and " - f"capitalization labels. Each label in these files is written in a separate line. The first labels in both " - f"files are equal and serve for padding and as neutral labels. Every tar file contains objects written " - f"using `webdataset.TarWriter`. Each object is a dictionary with two items: '__key__' and 'batch.pyd'. " - f"'__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains 'input_ids', " - f"'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, " - f"'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are " - f"arrays with ids of labels. Metadata file should be passed to constructor of " - "`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of " - "the class will handle iteration and constructing masks and token types for BERT model.", - ) - parser.add_argument( - "--text", - "-t", - help="Path to source lowercased text without punctuation. Number of lines in `--text` file has to be equal " - "to number of lines in `--labels` file.", - type=Path, - required=True, - ) - parser.add_argument( - "--audio_file", - type=Path, - required=False, - help="Path to source file which contains paths to audio one path per line. " - "Number of lines in `--audio_file` has to be equal to number of lines in `--labels` file", - ) - parser.add_argument( - "--use_audio", - required=False, - action="store_true", - help="If set to `True` script creates lexical audio dataset which can be used with `PunctuationCapitalizationLexicalAudioModel`.", - ) - parser.add_argument( - "--sample_rate", - type=int, - required=False, - help="Target sample rate of audios. Can be used for downsampling or upsampling.", - ) - parser.add_argument( - "--labels", - "-L", - type=Path, - required=True, - help="Path to file with labels in the format described here " - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#" - "nemo-data-format . Number of lines in `--labels` file has to be equal to the number of lines in `--text` " - "file.", - ) - parser.add_argument( - "--output_dir", - "-o", - type=Path, - required=True, - help="Path to directory where .tar files, metadata file, label id files are stored.", - ) - parser.add_argument( - "--max_seq_length", - "-s", - type=int, - default=512, - help="Maximum number of subtokens in an input sequence. A source sequence which contain too many subtokens are " - "clipped to `--max_seq_length - 2` subtokens and then [CLS] token is prepended to the clipped sequence and " - "[SEP] token is appended to the clipped sequence. The clipping is performed via removal of subtokens in the " - "end of a source sequence.", - ) - parser.add_argument( - "--tokens_in_batch", - "-b", - type=int, - default=15000, - help="Maximum number of tokens in a batch including [CLS], [SEP], [UNK], and [PAD] tokens. 
Before packing into " - "batches source sequences are sorted by number of tokens in order to reduce number of pad tokens. So the " - "number of sequences in a batch may be different.", - ) - parser.add_argument( - "--lines_per_dataset_fragment", - type=int, - default=10 ** 6, - help="A number of lines processed by one worker during creation of tarred dataset. A worker tokenizes " - "`--lines_per_dataset_fragment` lines and keeps in RAM tokenized text labels before packing them into " - "batches. Reducing `--lines_per_dataset_fragment` leads to reducing of the amount of memory required by this " - "script.", - ) - parser.add_argument( - "--num_batches_per_tarfile", - type=int, - default=1000, - help="A number of batches saved in a tar file. If you increase `--num_batches_per_tarfile`, then there will " - "be less tar files in the dataset. There cannot be less then `--num_batches_per_tarfile` batches in a tar " - "file, and all excess batches are removed. Maximum number of discarded batches is " - "`--num_batches_per_tarfile - 1`.", - ) - parser.add_argument( - "--tokenizer_name", - "-T", - default="bert-base-uncased", - help="Name of the tokenizer used for tokenization of source sequences. Possible options are 'sentencepiece', " - "'word', 'char', HuggingFace tokenizers. For more options see function " - "`nemo.collections.nlp.modules.common.get_tokenizer`. The tokenizer has to have properties `cls_id`, " - "`pad_id`, `sep_id`, `unk_id`.", - ) - parser.add_argument( - "--tokenizer_model", "-m", type=Path, help="Path to tokenizer model required for 'sentencepiece' tokenizer." - ) - parser.add_argument( - "--vocab_file", - "-v", - type=Path, - help="Path to vocabulary file which can be used in 'word', 'char', and HuggingFace tokenizers.", - ) - parser.add_argument( - "--merges_file", "-M", type=Path, help="Path to merges file which can be used in HuggingFace tokenizers." - ) - parser.add_argument( - "--special_token_names", - "-n", - nargs="+", - help="Names of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--special_token_values", - "-V", - nargs="+", - help="Values of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--use_fast_tokenizer", "-f", action="store_true", help="Whether to use fast HuggingFace tokenizer." - ) - parser.add_argument( - "--pad_label", - "-P", - default='O', - help="Pad label both for punctuation and capitalization. This label is also is used for marking words which " - "do not need punctuation and capitalization. It is also a neutral label used for marking words which do " - "not require punctuation and capitalization.", - ) - punct = parser.add_mutually_exclusive_group(required=False) - punct.add_argument( - "--punct_labels", - "-p", - nargs="+", - help="All punctuation labels EXCEPT PAD LABEL. Punctuation labels are strings separated by spaces. " - "Alternatively you can use parameter `--punct_label_vocab_file`. If none of parameters `--punct_labels` " - "and `--punct_label_vocab_file` are provided, then punctuation label ids will be inferred from `--labels` " - "file.", - ) - punct.add_argument( - "--punct_label_vocab_file", - type=Path, - help="A path to file with punctuation labels. These labels include pad label. Pad label has to be the first " - "label in the file. Each label is written on separate line. 
Alternatively you can use `--punct_labels` " - "parameter. If none of parameters `--punct_labels` and `--punct_label_vocab_file` are provided, then " - "punctuation label ids will be inferred from `--labels` file.", - ) - capit = parser.add_mutually_exclusive_group(required=False) - capit.add_argument( - "--capit_labels", - "-c", - nargs="+", - help="All capitalization labels EXCEPT PAD LABEL. Capitalization labels are strings separated by spaces. " - "Alternatively you can use parameter `--capit_label_vocab_file`. If none of parameters `--capit_labels` " - "and `--capit_label_vocab_file` are provided, then capitalization label ids will be inferred from `--labels` " - "file.", - ) - capit.add_argument( - "--capit_label_vocab_file", - type=Path, - help="A path to file with capitalization labels. These labels include pad label. Pad label has to be the " - "first label in the file. Each label is written on separate line. Alternatively you can use `--capit_labels` " - "parameter. If none of parameters `--capit_labels` and `--capit_label_vocab_file` are provided, then " - "capitalization label ids will be inferred from `--labels` file.", - ) - parser.add_argument( - "--tar_file_prefix", - "-x", - default="punctuation_capitalization", - help="A string from which tar file names start. It can contain only characters 'A-Z', 'a-z', '0-9', '_', '-', " - "'.'.", - ) - parser.add_argument( - "--n_jobs", - "-j", - type=int, - default=mp.cpu_count(), - help="Number of workers for creating tarred dataset. By default it is equal to the number of CPU cores.", - ) - args = parser.parse_args() - for name in [ - "text", - "labels", - "output_dir", - "tokenizer_model", - "vocab_file", - "merges_file", - "punct_label_vocab_file", - "capit_label_vocab_file", - ]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - if args.special_token_names is not None or args.special_token_values is not None: - if args.special_token_names is None: - parser.error( - "If you provide parameter `--special_token_values` you have to provide parameter " - "`--special_token_names`." - ) - if args.special_token_values is None: - parser.error( - "If you provide parameter `--special_token_names` you have to provide parameter " - "`--special_token_values`." - ) - if len(args.special_token_names) != len(args.special_token_values): - parser.error( - f"Parameters `--special_token_names` and `--special_token_values` have to have equal number of values " - f"whereas parameter `--special_token_names` has {len(args.special_token_names)} values and parameter " - f"`--special_token_values` has {len(args.special_token_values)} values." - ) - if len(set(args.special_token_names)) != len(args.special_token_names): - for i in range(len(args.special_token_names) - 1): - if args.special_token_names[i] in args.special_token_names[i + 1 :]: - parser.error( - f"Values of parameter `--special_token_names` has to be unique. Found duplicate value " - f"'{args.special_token_names[i]}'." 
- ) - if args.punct_labels is not None: - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.punct_labels, '--pad_label', '--punct_labels', parser.error - ) - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.capit_labels, '--pad_label', '--capit_labels', parser.error - ) - check_tar_file_prefix(args.tar_file_prefix, parser.error, '--tar_file_prefix') - return args - - -def main() -> None: - args = get_args() - if args.special_token_names is None: - special_tokens = None - else: - special_tokens = dict(zip(args.special_token_names, args.special_token_values)) - - if args.punct_labels is not None: - punct_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.punct_labels) - else: - punct_label_ids = None - - if args.capit_labels is not None: - capit_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.capit_labels) - else: - capit_label_ids = None - - create_tarred_dataset( - args.text, - args.labels, - args.output_dir, - args.max_seq_length, - args.tokens_in_batch, - args.lines_per_dataset_fragment, - args.num_batches_per_tarfile, - args.tokenizer_name, - tokenizer_model=args.tokenizer_model, - vocab_file=args.vocab_file, - merges_file=args.merges_file, - special_tokens=special_tokens, - use_fast_tokenizer=args.use_fast_tokenizer, - pad_label=args.pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - punct_label_vocab_file=args.punct_label_vocab_file, - capit_label_vocab_file=args.capit_label_vocab_file, - tar_file_prefix=args.tar_file_prefix, - n_jobs=args.n_jobs, - audio_file=args.audio_file, - sample_rate=args.sample_rate, - use_audio=args.use_audio, - ) - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/token_classification/data/get_libritts_data.py b/examples/nlp/token_classification/data/get_libritts_data.py deleted file mode 100644 index 86a5d01eb9dc..000000000000 --- a/examples/nlp/token_classification/data/get_libritts_data.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This script downloads and unpacks LibriTTS data. And prepares it for punctuation and capitalization lexical audio model. -Data is being downloaded from www.openslr.org and then extracted via tar. -The script gathers text from every *.normalized.txt file inside of archive into single file with text and file with audio filepaths. 
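For orientation, the metadata file written by the tarred-dataset script above (with keys 'num_batches', 'tar_files', and the two label-vocabulary paths, as its description states) can be inspected with a few lines; the file location used here is only an assumption.

    import json
    from pathlib import Path

    metadata_path = Path("tarred_dataset/metadata.json")  # hypothetical output location
    metadata = json.loads(metadata_path.read_text())
    print(metadata["num_batches"])
    # Tar file paths are stored relative to the directory containing the metadata file.
    print([str(metadata_path.parent / name) for name in metadata["tar_files"]])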
-""" -import argparse -import glob -import os -import re -import shutil -import subprocess -import tarfile - -from tqdm import tqdm - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = { - 'train_clean_100': "https://www.openslr.org/resources/60/train-clean-100.tar.gz", - 'train_clean_360': "https://www.openslr.org/resources/60/train-clean-360.tar.gz", - 'train_other_500': "https://www.openslr.org/resources/60/train-other-500.tar.gz", - 'dev_clean': "https://www.openslr.org/resources/60/dev-clean.tar.gz", - 'dev_other': "https://www.openslr.org/resources/60/dev-other.tar.gz", - 'test_clean': "https://www.openslr.org/resources/60/test-clean.tar.gz", - 'test_other': "https://www.openslr.org/resources/60/test-other.tar.gz", -} - - -def __extract_file(filepath, data_dir): - try: - tar = tarfile.open(filepath) - tar.extractall(data_dir) - tar.close() - except Exception: - print(f"Error while extracting {filepath}. Already extracted?") - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - return 1 - else: - logging.info(f'{destination} found. Skipping download') - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Prepare LibriTTS dataset for punctuation capitalization lexical audio model training/evaluating.' - ) - parser.add_argument("--data_sets", default="dev_clean", type=str, help="List of subsets separated by comma") - parser.add_argument("--data_dir", required=True, type=str, help="Path to dir where data will be stored") - parser.add_argument( - "--clean", "-c", action="store_true", help="If set to True will delete all files except produced .txt and .wav" - ) - args = parser.parse_args() - - data_dir = args.data_dir - - if not os.path.exists(data_dir): - os.makedirs(data_dir) - - for subset in args.data_sets.split(','): - logging.info(f'Downloading {subset} subset') - if __maybe_download_file(data_dir + f'/{subset}.tar.gz', subset): - logging.info(f'Extracting {subset} subset') - __extract_file(data_dir + f'/{subset}.tar.gz', data_dir) - - logging.info(f'Processing data') - - splits = set([split.split('_')[0] for split in args.data_sets.split(',')]) - for split in splits: - os.makedirs(f'{data_dir}/audio/{split}', exist_ok=True) - with open(f'{data_dir}/{split}.txt', 'w') as text_data, open( - f'{data_dir}/audio_{split}.txt', 'w' - ) as audio_data: - for file in tqdm(glob.glob(f'{data_dir}/LibriTTS/{split}*/*/*/*.wav'), desc=f'Processing {split}'): - with open(file[:-4] + '.normalized.txt', 'r') as source_file: - lines = source_file.readlines() - text = lines[0] - text = re.sub(r"[^a-zA-Z\d,?!.']", ' ', text) - text = re.sub(' +', ' ', text) - shutil.copy(file.strip(), (f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip()) - text_data.write(text.strip() + "\n") - audio_data.write((f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip() + "\n") - create_text_and_labels(f'{data_dir}/', f'{data_dir}/{split}.txt') - logging.info(f'Processed {split} subset') - - if args.clean: - shutil.rmtree(f'{data_dir}/LibriTTS') - for tar in glob.glob(f'{data_dir}/**.tar.gz'): - os.remove(tar) diff --git 
a/examples/nlp/token_classification/data/get_tatoeba_data.py b/examples/nlp/token_classification/data/get_tatoeba_data.py deleted file mode 100644 index 6a4cd23b249d..000000000000 --- a/examples/nlp/token_classification/data/get_tatoeba_data.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import os -import random -import re -import subprocess - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - else: - logging.info(f'{destination} found. Skipping download') - - -def __process_english_sentences( - in_file: str, out_file: str, percent_to_cut: float = 0, num_to_combine: int = 1, num_samples: int = -1 -): - """ - Extract English sentences from the Tatoeba dataset. - - Expected in_file format - that - contain letters and punctuation marks (,.?). - Chop and combine sentences. - Args: - in_file: local filepath to the tatoeba dataset. - Format: id [TAB] region_name [TAB] sentence, - for example: "1276\teng\tLet's try something.\n" - out_file: local filepath to the clean dataset - percent_to_cut: Percent of sentences to cut in the middle - to get examples of incomplete sentences. 
- This could be useful since ASR output not always - represents a complete sentence - num_to_combine: Number of sentences to combine into - a single example - num_samples: Number of samples in the final dataset - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - in_file = open(in_file, 'r') - out_file = open(out_file, 'w') - lines_to_combine = [] - samples_count = 0 - - for line in in_file: - line = line.split('\t') - # use only English sentences - if line[1] == 'eng': - line = line[2].strip() - if re.match("^[A-Z][A-Za-z.,'?\s]+$", line): # nopep8 - # chop some sentences in the middle - if percent_to_cut > 0: - line = line.split() - if random.random() < percent_to_cut: - line = line[: len(line) // 2] - line = ' '.join(line) - - # combine multiple sentences into a single example - # to make it harder for the model to learn eos punctuation - if len(lines_to_combine) >= num_to_combine: - if samples_count == num_samples: - return - out_file.write(' '.join(lines_to_combine) + '\n') - lines_to_combine = [] - samples_count += 1 - lines_to_combine.append(line) - - if len(lines_to_combine) > 0 and (samples_count < num_samples or num_samples < 0): - out_file.write(' '.join(lines_to_combine) + '\n') - - -def __split_into_train_dev(in_file: str, train_file: str, dev_file: str, percent_dev: float): - """ - Create train and dev split of the dataset. - Args: - in_file: local filepath to the dataset - train_file: local filepath to the train dataset - dev_file: local filepath to the dev dataset - percent_dev: Percent of the sentences in the dev set - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - lines = open(in_file, 'r').readlines() - train_file = open(train_file, 'w') - dev_file = open(dev_file, 'w') - - dev_size = int(len(lines) * percent_dev) - train_file.write(' '.join(lines[:-dev_size])) - dev_file.write(' '.join(lines[-dev_size:])) - - -def __delete_file(file_to_del: str): - """ - Deletes the file - Args: - file_to_del: local filepath to the file to delete - """ - if os.path.exists(file_to_del): - os.remove(file_to_del) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare tatoeba dataset') - parser.add_argument("--data_dir", required=True, type=str) - parser.add_argument("--dataset", default='tatoeba', type=str) - parser.add_argument("--num_samples", default=-1, type=int, help='-1 to use the whole dataset') - parser.add_argument("--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle') - parser.add_argument( - "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example' - ) - parser.add_argument("--percent_dev", default=0.2, type=float, help='Size of the dev set, float') - parser.add_argument("--clean_dir", action='store_true') - args = parser.parse_args() - - if not os.path.exists(args.data_dir): - os.makedirs(args.data_dir) - - if args.dataset != 'tatoeba': - raise ValueError("Unsupported dataset.") - - logging.info(f'Downloading tatoeba dataset') - tatoeba_dataset = os.path.join(args.data_dir, 'sentences.csv') - __maybe_download_file(tatoeba_dataset, args.dataset) - - logging.info(f'Processing English sentences...') - clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt') - __process_english_sentences( - tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples - ) - - train_file = os.path.join(args.data_dir, 'train.txt') - dev_file 
= os.path.join(args.data_dir, 'dev.txt') - - logging.info( - f'Splitting the {args.dataset} dataset into train and dev sets' + ' and creating labels and text files' - ) - __split_into_train_dev(clean_eng_sentences, train_file, dev_file, args.percent_dev) - - logging.info(f'Creating text and label files for training') - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'train.txt')) - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'dev.txt')) - - if args.clean_dir: - logging.info(f'Cleaning up {args.data_dir}') - __delete_file(clean_eng_sentences) - __delete_file(tatoeba_dataset) - __delete_file(train_file) - __delete_file(dev_file) - logging.info(f'Processing of the {args.dataset} is complete') diff --git a/examples/nlp/token_classification/data/import_from_iob_format.py b/examples/nlp/token_classification/data/import_from_iob_format.py deleted file mode 100644 index 4a6f15442b98..000000000000 --- a/examples/nlp/token_classification/data/import_from_iob_format.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -from nemo.utils import logging - - -def __convert_data(in_file: str, out_text_f: str, out_labels_f: str, max_length: int): - """ - Convert data from the IOB format to NeMo accepted format described below. - in_file should be in the IOB format, see example here: - https://www.clips.uantwerpen.be/conll2003/ner/. - - Args: - in_file: input file name - out_text_f: output file with text - out_labels_f: output file with labels - max_length: use -1 to leave the examples' length as is, otherwise long examples will be split into multiple - examples - After the conversion, the dataset is split into 2 files: text.txt - and labels.txt. - Each line of the text.txt file contains text sequences, where words - are separated with spaces. The labels.txt file contains corresponding - labels for each word in text.txt, the labels are separated with spaces. - Each line of the files should follow the format: - [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). 
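A tiny worked example of the conversion described above, using a made-up IOB snippet: the word is the first column and the tag is the last column of every non-empty line.

    iob_lines = ["Harry B-PER", "Potter I-PER", "visited O", "London B-LOC", ""]
    words = [line.split()[0] for line in iob_lines if line.strip()]
    labels = [line.split()[-1] for line in iob_lines if line.strip()]
    print(" ".join(words))   # line for text_<prefix>.txt
    print(" ".join(labels))  # line for labels_<prefix>.txt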
- - """ - in_file = open(in_file, 'r') - - if max_length == -1: - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - for line in in_file: - if line == '\n': - out_text.write(line) - out_labels.write(line) - else: - line = line.split() - out_text.write(line[0] + ' ') - out_labels.write(line[-1] + ' ') - - else: - words = [] - labels = [] - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - lines = in_file.readlines() - for line_id, line in enumerate(lines): - logging.info(f"{line_id} {len(lines)}") - contends = line.strip() - if len(contends) == 0: - assert len(words) == len(labels) - if len(words) > max_length: - # split if the sentence is longer than max_length - while len(words) > max_length: - tmplabel = labels[:max_length] - for iidx in range(len(tmplabel)): - if tmplabel.pop() == 'O': - break - l = ' '.join([label for label in labels[: len(tmplabel) + 1] if len(label) > 0]) - w = ' '.join([word for word in words[: len(tmplabel) + 1] if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = words[len(tmplabel) + 1 :] - labels = labels[len(tmplabel) + 1 :] - - if len(words) == 0: - continue - l = ' '.join([label for label in labels if len(label) > 0]) - w = ' '.join([word for word in words if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = [] - labels = [] - continue - - word = line.strip().split()[0] - label = line.strip().split()[-1] - words.append(word) - labels.append(label) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Convert data from IOB format to the format compatible with \ - nlp/examples/token_classification/scripts/token_classification_train.py and \ - token_classification_evaluate.py' - ) - parser.add_argument("--data_file", required=True, type=str, help='path to a file in IOB format') - parser.add_argument( - "--max_length", - default=-1, - type=int, - help='use -1 to leave the examples\'s length as is, ' - 'otherwise long examples will be split into multiple examples', - ) - args = parser.parse_args() - - data_dir, basename = os.path.split(args.data_file) - prefix = os.path.splitext(basename)[0] - if not os.path.exists(args.data_file): - raise FileNotFoundError(f"{args.data_file} not found") - - logging.info(f'Processing {args.data_file}') - out_text = os.path.join(data_dir, 'text_' + prefix + '.txt') - out_labels = os.path.join(data_dir, 'labels_' + prefix + '.txt') - - __convert_data(args.data_file, out_text, out_labels, args.max_length) - logging.info(f'Processing of the {args.data_file} is complete') diff --git a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py deleted file mode 100644 index 78a0763d3b54..000000000000 --- a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -The script converts raw text to the NeMo format for punctuation and capitalization task. - -Raw Data Format ---------------- - -The Punctuation and Capitalization model can work with any text dataset, although it is recommended to balance the data, especially for the punctuation task. -Before pre-processing the data to the format expected by the model, the data should be split into train.txt and dev.txt (and optionally test.txt). -Each line in the **train.txt/dev.txt/test.txt** should represent one or more full and/or truncated sentences. - -Example of the train.txt/dev.txt file: - When is the next flight to New York? - The next flight is ... - .... - - -The `source_data_dir` structure should look like this: - . - |--sourced_data_dir - |-- dev.txt - |-- train.txt - - - -NeMo Data Format for training the model ---------------------------------------- - -The punctuation and capitalization model expects the data in the following format: - -The training and evaluation data is divided into 2 files: text.txt and labels.txt. \ -Each line of the **text.txt** file contains text sequences, where words are separated with spaces, i.e. - -[WORD] [SPACE] [WORD] [SPACE] [WORD], for example: - when is the next flight to new york - the next flight is ... - ... - -The **labels.txt** file contains corresponding labels for each word in text.txt, the labels are separated with spaces. \ -Each label in labels.txt file consists of 2 symbols: - -* the first symbol of the label indicates what punctuation mark should follow the word (where O means no punctuation needed); -* the second symbol determines if a word needs to be capitalized or not (where U indicates that the word should be upper-cased, and O - no capitalization needed.) - -By default, the following punctuation marks are considered: commas, periods, and question marks; the rest punctuation marks were removed from the data. -This can be changed by introducing new labels in the labels.txt files - -Each line of the labels.txt should follow the format: [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). \ -For example, labels for the above text.txt file should be: - - OU OO OO OO OO OO OU ?U - OU OO OO OO ... - ... - -The complete list of all possible labels for this task used in this tutorial is: OO, ,O, .O, ?O, OU, ,U, .U, ?U. 
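The two-character label scheme above can be checked with a short sketch (illustrative only, not the create_text_and_labels implementation); it reproduces the labels shown for the sample sentence.

    def make_labels(sentence, marks=",.?"):
        words, labels = [], []
        for word in sentence.split():
            punct = word[-1] if word[-1] in marks else "O"   # punctuation following the word
            stripped = word.rstrip(marks)
            capit = "U" if stripped[:1].isupper() else "O"   # capitalization of the word
            words.append(stripped.lower())
            labels.append(punct + capit)
        return " ".join(words), " ".join(labels)

    print(make_labels("When is the next flight to New York?"))
    # -> ('when is the next flight to new york', 'OU OO OO OO OO OO OU ?U')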
- -Converting Raw data to NeMo format ----------------------------------- - -To pre-process the raw text data, stored under :code:`sourced_data_dir` (see the :ref:`raw_data_format_punct` -section), run the following command: - - python examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py \ - -s \ - -o - -""" - -import argparse -import os - -from get_tatoeba_data import create_text_and_labels - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare data for punctuation and capitalization tasks') - parser.add_argument("-s", "--source_file", required=True, type=str, help="Path to the source file") - parser.add_argument("-o", "--output_dir", required=True, type=str, help="Path to the output directory") - parser.add_argument( - "-p", - "--marks", - required=False, - type=str, - help="Punctuation marks to consider for dataset", - default=[",", ".", "?"], - nargs="+", - ) - args = parser.parse_args() - - if not os.path.exists(args.source_file): - raise ValueError(f'{args.source_file} was not found') - - os.makedirs(args.output_dir, exist_ok=True) - create_text_and_labels(args.output_dir, args.source_file, "".join(args.marks)) - - print(f'Processing of the {args.source_file} is complete') diff --git a/examples/nlp/token_classification/punctuate_capitalize_infer.py b/examples/nlp/token_classification/punctuate_capitalize_infer.py deleted file mode 100644 index 8fdb3ab5a1ed..000000000000 --- a/examples/nlp/token_classification/punctuate_capitalize_infer.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -from pathlib import Path -from typing import Dict, List, Union - -import torch.cuda - -from nemo.collections.nlp.models import PunctuationCapitalizationLexicalAudioModel, PunctuationCapitalizationModel - - -""" -This script is for restoring punctuation and capitalization. - -Usage example: - -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest - -Usage example for lexical audio model: -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest \ - --use_audio - - - is a path to NeMo ASR manifest. Usually it is an output of - NeMo/examples/asr/transcribe_speech.py but can be a manifest with 'text' key. Alternatively you can use - --input_text parameter for passing text for inference. - is a path to NeMo ASR manifest into which script output will be written. Alternatively - you can use parameter --output_text. - -For more details on this script usage look in argparse help. -""" - - -def get_args() -> argparse.Namespace: - default_model_parameter = "pretrained_name" - default_model = "punctuation_en_bert" - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="The script is for restoring punctuation and capitalization in text or text and audio. To use text and audio use '--use_audio'. 
Long strings are split into " - "segments of length `--max_seq_length`. `--max_seq_length` is the length which includes [CLS] and [SEP] " - "tokens. If `--use_audio` is set, samples with texts longer than `--max_seq_length` will be ignored. Parameter `--step` controls segments overlapping. `--step` is a distance between beginnings of " - "consequent segments. Model outputs for tokens near the borders of tensors are less accurate and can be " - "discarded before final predictions computation. Parameter `--margin` is number of discarded outputs near " - "segments borders. Probabilities of tokens in overlapping parts of segments multiplied before selecting the " - "best prediction. Default values of parameters `--max_seq_length`, `--step`, and `--margin` are optimal for " - "IWSLT 2019 test dataset.", - ) - parser.add_argument( - '--use_audio', - required=False, - action="store_true", - help="If set `PunctuationCapitalizationLexicalAudioModel` will be used for inference", - ) - input_ = parser.add_mutually_exclusive_group(required=True) - input_.add_argument( - "--input_manifest", - "-m", - type=Path, - help="Path to the file with NeMo manifest which needs punctuation and capitalization. If the first element " - "of manifest contains key 'pred_text', 'pred_text' values are passed for tokenization. Otherwise 'text' " - "values are passed for punctuation and capitalization. Exactly one parameter of `--input_manifest` and " - "`--input_text` should be provided.", - ) - input_.add_argument( - "--input_text", - "-t", - type=Path, - help="Path to file with text which needs punctuation and capitalization. Exactly one parameter of " - "`--input_manifest` and `--input_text` should be provided.", - ) - parser.add_argument( - '--audio_file', - required=False, - type=Path, - help="Path to file with paths to audio. One path per row. Required if '--input_text' provided. Else 'audio_filepath' from manifest will be used.", - ) - output = parser.add_mutually_exclusive_group(required=True) - output.add_argument( - "--output_manifest", - "-M", - type=Path, - help="Path to output NeMo manifest. Text with restored punctuation and capitalization will be saved in " - "'pred_text' elements if 'pred_text' key is present in the input manifest. Otherwise text with restored " - "punctuation and capitalization will be saved in 'text' elements. Exactly one parameter of `--output_manifest` " - "and `--output_text` should be provided.", - ) - output.add_argument( - "--output_text", - "-T", - type=Path, - help="Path to file with text with restored punctuation and capitalization. Exactly one parameter of " - "`--output_manifest` and `--output_text` should be provided.", - ) - model = parser.add_mutually_exclusive_group(required=False) - model.add_argument( - "--pretrained_name", - "-p", - help=f"The name of NGC pretrained model. No more than one of parameters `--pretrained_name`, `--model_path`" - f"should be provided. If neither of parameters `--pretrained_name` and `--model_path` are provided, then the " - f"script is run with `--{default_model_parameter}={default_model}`.", - choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()] - + [m.pretrained_model_name for m in PunctuationCapitalizationLexicalAudioModel.list_available_models()], - ) - model.add_argument( - "--model_path", - "-P", - type=Path, - help=f"Path to .nemo checkpoint of punctuation and capitalization model. No more than one of parameters " - f"`--pretrained_name` and `--model_path` should be provided. 
If neither of parameters `--pretrained_name` and " - f"`--model_path` are provided, then the script is run with `--{default_model_parameter}={default_model}`.", - ) - parser.add_argument( - "--max_seq_length", - "-L", - type=int, - default=64, - help="Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] tokens.", - ) - parser.add_argument( - "--step", - "-s", - type=int, - default=8, - help="Relative shift of consequent segments into which long queries are split. Long queries are split into " - "segments which can overlap. Parameter `step` controls such overlapping. Imagine that queries are " - "tokenized into characters, `max_seq_length=5`, and `step=2`. In such a case query 'hello' is tokenized " - "into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.", - ) - parser.add_argument( - "--margin", - "-g", - type=int, - default=16, - help="A number of subtokens in the beginning and the end of segments which output probabilities are not used " - "for prediction computation. The first segment does not have left margin and the last segment does not have " - "right margin. For example, if input sequence is tokenized into characters, `max_seq_length=5`, `step=1`, " - "and `margin=1`, then query 'hello' will be tokenized into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], " - "['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments are passed to the " - "model. Before final predictions computation, margins are removed. In the next list, subtokens which logits " - "are not used for final predictions computation are marked with asterisk: `[['[CLS]'*, 'h', 'e', 'l'*, " - "'[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]`.", - ) - parser.add_argument( - "--batch_size", "-b", type=int, default=128, help="Number of segments which are processed simultaneously.", - ) - parser.add_argument( - "--save_labels_instead_of_text", - "-B", - action="store_true", - help="If this option is set, then punctuation and capitalization labels are saved instead text with restored " - "punctuation and capitalization. Labels are saved in format described here " - "https://docs.nvidia.com/deeplearning/nemo/" - "user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format", - ) - parser.add_argument( - "--device", - "-d", - choices=['cpu', 'cuda'], - help="Which device to use. If device is not set and CUDA is available, then GPU will be used. 
If device is " - "not set and CUDA is not available, then CPU is used.", - ) - parser.add_argument( - "--sample_rate", - type=int, - default=16000, - help="Target sample rate for audios if `--use_audio` was passed", - required=False, - ) - args = parser.parse_args() - if args.input_manifest is None and args.output_manifest is not None: - parser.error("--output_manifest requires --input_manifest") - if args.use_audio and (args.input_manifest is None and args.audio_file is None): - parser.error("--use_audio and --input_text require --audio_file") - if args.pretrained_name is None and args.model_path is None: - setattr(args, default_model_parameter, default_model) - for name in ["input_manifest", "input_text", "output_manifest", "output_text", "model_path", "audio_file"]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - return args - - -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result - - -def main() -> None: - args = get_args() - if args.pretrained_name is None: - model = ( - PunctuationCapitalizationModel.restore_from(args.model_path) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - else: - model = ( - PunctuationCapitalizationModel.from_pretrained(args.pretrained_name) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - if args.device is None: - if torch.cuda.is_available(): - model = model.cuda() - else: - model = model.cpu() - else: - model = model.to(args.device) - if args.input_manifest is None: - texts = [] - audios = [] - with args.input_text.open() as f: - for line in f: - texts.append(line.strip()) - if args.use_audio: - with args.audio_file.open() as f: - for line in f: - audios.append(line.strip()) - else: - manifest = load_manifest(args.input_manifest) - text_key = "pred_text" if "pred_text" in manifest[0] else "text" - texts = [] - audios = [] - for item in manifest: - texts.append(item[text_key]) - if args.use_audio: - audios.append(item["audio_filepath"]) - if args.use_audio: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - audio_queries=audios, - target_sr=args.sample_rate, - ) - else: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - ) - if args.output_manifest is None: - args.output_text.parent.mkdir(exist_ok=True, parents=True) - with args.output_text.open('w') as f: - for t in processed_texts: - f.write(t + '\n') - else: - args.output_manifest.parent.mkdir(exist_ok=True, parents=True) - with args.output_manifest.open('w') as f: - for item, t in zip(manifest, processed_texts): - item[text_key] = t - f.write(json.dumps(item) + '\n') - - -if __name__ == "__main__": - main() diff --git a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py deleted file mode 100644 index 508e434bb598..000000000000 --- 
a/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationLexicalAudioConfig, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_lexical_audio_model import ( - PunctuationCapitalizationLexicalAudioModel, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model with lexical and acoustic features. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, audio encoder, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
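The docstring sections that follow pass configuration overrides as dotted command-line arguments. As a minimal OmegaConf sketch (keys and file names here are made up for illustration), this is roughly what such overrides amount to when Hydra merges them into the YAML config:

    from omegaconf import OmegaConf

    base = OmegaConf.create({"model": {"train_ds": {"text_file": None}}, "trainer": {"max_epochs": 1}})
    # Dotted overrides like `model.train_ds.text_file=...` update nested keys.
    overrides = OmegaConf.from_dotlist(["model.train_ds.text_file=train_text.txt", "trainer.max_epochs=3"])
    cfg = OmegaConf.merge(base, overrides)
    print(OmegaConf.to_yaml(cfg))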
- -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= - -To use BERT-like pretrained P&C models' weights to initialize lexical encoder, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.restore_lexical_encoder_from= - - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - pretrained_model= \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model== \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_lexical_audio_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationLexicalAudioConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to be `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationLexicalAudioModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationLexicalAudioModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationLexicalAudioModel.get_available_model_names(): - model = PunctuationCapitalizationLexicalAudioModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from ' - f'{PunctuationCapitalizationLexicalAudioModel.list_available_models()}' - ) - 
model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py deleted file mode 100644 index b16e1ecd0bdc..000000000000 --- a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import PunctuationCapitalizationModel -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationConfig, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
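The `text_file` and `labels_file` arguments referenced in the training commands that follow use the NeMo punctuation-and-capitalization label format (see the data-format documentation linked in the inference script above for the authoritative description). A small, hedged illustration of how one text/label pair decodes, assuming the first label character is the punctuation mark to insert after the word ('O' for none) and the second is the capitalization flag ('U' = capitalize):

    text_line = "when is the next flight to new york"
    label_line = "OU OO OO OO OO OO OU ?U"
    for word, label in zip(text_line.split(), label_line.split()):
        punct, capit = label[0], label[1]
        print(f"{word}: punctuation after = {punct!r}, capitalize = {capit == 'U'}")
    # Decoded target sentence: "When is the next flight to New York?"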
- -Additional default parameters could be found in PunctuationCapitalizationDataConfigBase from -/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py, -use `+` to modify their values via command line, e.g.: `+model.train_ds.num_workers=2` - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - -To use one of the pretrained versions of the model and finetune it, run: - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - - pretrained_model - pretrained PunctuationCapitalization model from list_available_models() or - path to a .nemo file, for example: punctuation_en_bert or model.nemo - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model=punctuation_en_bert \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(): - model = 
PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Config parameter `pretrained_model` should contain a path to the pre-trained .nemo file or a model ' - f'name from ' - f'{[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()]}. ' - f'Provided `pretrained_model="{cfg.pretrained_model}"` is neither a valid path, nor a valid model ' - f'name.' - ) - model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/token_classification_evaluate.py b/examples/nlp/token_classification/token_classification_evaluate.py deleted file mode 100644 index 764aa90c8593..000000000000 --- a/examples/nlp/token_classification/token_classification_evaluate.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script shows how to perform evaluation and runs inference of a few examples. - -More details on Token Classification model could be found in tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Setting the configs *** - -This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
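The evaluation script below runs both `trainer.test(model)` and `model.evaluate_from_file(...)`. As a standalone, hedged usage sketch of the latter (file names and paths are placeholders; the keyword arguments mirror the call made later in this script):

    import os

    from nemo.collections.nlp.models import TokenClassificationModel

    model = TokenClassificationModel.from_pretrained("ner_en_bert")
    data_dir = "/path/to/ner_data"  # placeholder directory with text/labels files
    model.evaluate_from_file(
        text_file=os.path.join(data_dir, "text_dev.txt"),      # assumed file name
        labels_file=os.path.join(data_dir, "labels_dev.txt"),   # assumed file name
        output_dir="./eval_results",
        add_confusion_matrix=True,
        normalize_confusion_matrix=True,
    )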
- -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Evaluation *** - -The script runs two types of evaluation: - * model.test() - this eval will use the config setting for evaluation such as model.dataset.max_seq_length - * model.evaluate_from_file(): - * disregards model.dataset.max_seq_length and evaluates all the tokens, BERT max seq length - 512 tokens after tokenization - * creates confusion matrix - * saves predictions and labels (if provided) - -To run the script: - - python token_classification_evaluate.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or your_model.nemo - -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - - if not hasattr(cfg.model, 'test_ds'): - raise ValueError(f'model.test_ds was not found in the config, skipping evaluation') - - trainer = pl.Trainer( - devices=1, - precision=cfg.trainer.precision, - logger=False, - enable_checkpointing=False, - accelerator=cfg.trainer.accelerator, - ) - exp_dir = exp_manager(trainer, cfg.exp_manager) - - if not cfg.pretrained_model: - raise ValueError( - 'To run evaluation and inference script a pre-trained model or .nemo file must be provided.' - f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"' - ) - - if os.path.exists(cfg.pretrained_model): - model = TokenClassificationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir is None: - logging.error( - 'No dataset directory provided. Skipping evaluation. ' - 'To run evaluation on a file, specify path to the directory that contains test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.' - ) - elif not os.path.exists(data_dir): - logging.error(f'{data_dir} is not found, skipping evaluation on the test set.') - else: - model.update_data_dir(data_dir=data_dir) - model._cfg.dataset = cfg.model.dataset - - if not hasattr(cfg.model, 'test_ds'): - logging.error(f'model.test_ds was not found in the config, skipping evaluation') - elif model.prepare_test(trainer): - model.setup_test_data(cfg.model.test_ds) - trainer.test(model) - - model.evaluate_from_file( - text_file=os.path.join(data_dir, cfg.model.test_ds.text_file), - labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file), - output_dir=exp_dir, - add_confusion_matrix=True, - normalize_confusion_matrix=True, - ) - else: - logging.error('Skipping the evaluation. 
The trainer is not setup properly.') - - # run an inference on a few examples - queries = ['we bought four shirts from the nvidia gear store in santa clara.', 'Nvidia is a company.'] - results = model.add_predictions(queries, output_file='predictions.txt') - - for query, result in zip(queries, results): - logging.info(f'Query : {query}') - logging.info(f'Result: {result.strip()}\n') - - logging.info(f'Results are saved at {exp_dir}') - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/token_classification/token_classification_train.py b/examples/nlp/token_classification/token_classification_train.py deleted file mode 100644 index 536327aff6da..000000000000 --- a/examples/nlp/token_classification/token_classification_train.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This scripts shows how to train a Token Classification model. - -The Token Classification model supports Named Entity Recognition task and other token level classification tasks, -as long as the data follows the format specified below. - -More details on how to use this script could be found in -tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Data Format *** -Token Classification Model requires the data to be split into 2 files: text.txt and labels.txt. -Each line of the text.txt file contains text sequences, where words are separated with spaces, i.e.: -[WORD] [SPACE] [WORD] [SPACE] [WORD]. -The labels.txt file contains corresponding labels for each word in text.txt, the labels are separated with spaces, i.e.: -[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]. - -Example of a text.txt file: -Jennifer is from New York City . -She likes ... -... - -Corresponding labels.txt file: -B-PER O O B-LOC I-LOC I-LOC O -O O ... -... - -*** Preparing the dataset *** - -To convert an IOB format data to the format required for training, run -examples/nlp/token_classification/data/import_from_iob_format.py on your train and dev files, as follows: - -python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. 
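The IOB conversion mentioned in the data-preparation notes above can be pictured with the following simplified sketch. It assumes a plain CoNLL-style layout (one `token label` pair per line, blank lines between sentences); the actual import_from_iob_format.py script may handle additional cases:

    def iob_to_text_and_labels(iob_path, text_path, labels_path):
        # Convert CoNLL-style IOB input into the text.txt / labels.txt pair
        # described in the *** Data Format *** section above.
        words, labels = [], []
        with open(iob_path) as fin, open(text_path, "w") as ftext, open(labels_path, "w") as flabels:
            for line in fin:
                line = line.strip()
                if not line:  # sentence boundary
                    if words:
                        ftext.write(" ".join(words) + "\n")
                        flabels.write(" ".join(labels) + "\n")
                        words, labels = [], []
                    continue
                parts = line.split()
                words.append(parts[0])    # token is the first column
                labels.append(parts[-1])  # label is the last column
            if words:  # flush the final sentence
                ftext.write(" ".join(words) + "\n")
                flabels.write(" ".join(labels) + "\n")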
-This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Training *** - -To train TokenClassification model from scratch with the default config file, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - trainer.max_epochs= \ - trainer.devices=[] - -To use one of the pretrained versions of the model specify a `pretrained_model` arg with either -TokenClassification model from list_available_models() or path to a .nemo file, for example: -ner_en_bert or model.nemo, run: - - python token_classification_train.py pretrained_model=ner_en_bert - -To use one of the pretrained versions of the model and fine-tune it, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or model.nemo - -For more ways of restoring a pre-trained model, see tutorials/00_NeMo_Primer.ipynb -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - try: - strategy = NLPDDPStrategy(find_unused_parameters=True) - except (ImportError, ModuleNotFoundError): - strategy = 'auto' - - trainer = pl.Trainer(strategy=strategy, **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = TokenClassificationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - # TODO: can we drop strict=False? 
- model = TokenClassificationModel.restore_from(cfg.pretrained_model, trainer=trainer, strict=False) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir: - if not os.path.exists(data_dir): - raise ValueError(f'{data_dir} is not found at') - - # we can also do finetuning of the pretrained model but it will require - # setup the data dir to get class weights statistics - model.update_data_dir(data_dir=data_dir) - # finally, setup train and validation Pytorch DataLoaders - model.setup_training_data() - model.setup_validation_data() - # then we're setting up loss, use model.dataset.class_balancing, - # if you want to add class weights to the CrossEntropyLoss - model.setup_loss(class_balancing=cfg.model.dataset.class_balancing) - logging.info(f'Using config file of the pretrained model') - else: - raise ValueError( - 'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file \ - with "model.dataset.data_dir" argument' - ) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml deleted file mode 100644 index 64fd5f8542f3..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Config file for Zero Shot Intent Recognition (BERT model trained NLI) -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 1 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - dataset: - data_dir: ??? # /path/to/data - sentence_1_column: 8 # index of the column containing the premise or sentence 1 - sentence_2_column: 9 # index of the column containing the hypothesis or sentence 2 - label_column: -1 # index of the column containing labels. Labels should be "entailment", "contradiction", and "neutral". - class_balancing: null # null or 'weighted_loss'. 
'weighted_loss' enables the weighted class balancing of the loss, may be used for handling unbalanced classes - use_cache: true # uses a cache to store the processed dataset, you may use it for large datasets for speed up - num_classes: 3 - max_seq_length: 128 - do_lower_case: true # true for uncased models, false for cased models, will be set automatically if pre-trained tokenizer model is used - - train_ds: - file_name: train.tsv - batch_size: 64 - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - file_name: dev_matched.tsv - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - file_name: null - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - classifier_head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: "ZeroShotIntentRecognition" # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -pretrained_model: # pretrained ZeroShotIntent model to be used for inference (.nemo file) \ No newline at end of file diff --git a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py deleted file mode 100644 index eca8f1ef87c6..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
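The configuration above trains an NLI classifier on MNLI-style TSV data (the `sentence_1_column`, `sentence_2_column`, and `label_column` entries). At inference time, zero-shot intent recognition typically recasts each candidate label as an entailment hypothesis about the query; a schematic, hedged sketch of that framing (the exact hypothesis template used by ZeroShotIntentModel may differ):

    def build_nli_pairs(query, candidate_labels, template="This example is {}."):
        # One (premise, hypothesis) pair per candidate label; the NLI model's
        # entailment probability becomes the score for that label.
        return [(query, template.format(label)) for label in candidate_labels]

    pairs = build_nli_pairs(
        "Turn off the lights in the living room",
        ["Food order", "Change lighting", "Play music"],
    )
    for premise, hypothesis in pairs:
        print(f"premise: {premise!r} -> hypothesis: {hypothesis!r}")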
- -import json -import os - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - - # initialize the model using the config file - if cfg.pretrained_model and os.path.exists(cfg.pretrained_model): - model = ZeroShotIntentModel.restore_from(cfg.pretrained_model, strict=False) - else: - raise ValueError('Provide path to the pre-trained .nemo checkpoint') - - # predicting an intent of a query - queries = [ - "I'd like a veggie burger and fries", - "Turn off the lights in the living room", - ] - - candidate_labels = ['Food order', 'Play music', 'Request for directions', 'Change lighting', 'Calendar query'] - - predictions = model.predict(queries, candidate_labels, batch_size=4, multi_label=True) - - logging.info('The prediction results of some sample queries with the trained model:') - for query in predictions: - logging.info(json.dumps(query, indent=4)) - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py b/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py deleted file mode 100644 index 4dbbf01c935e..000000000000 --- a/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = ZeroShotIntentModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/nemo/collections/nlp/data/__init__.py b/nemo/collections/nlp/data/__init__.py index 7c1b59d3868c..ffc0bdabe0e7 100644 --- a/nemo/collections/nlp/data/__init__.py +++ b/nemo/collections/nlp/data/__init__.py @@ -13,10 +13,6 @@ # limitations under the License. 
from nemo.collections.nlp.data.data_utils import * # noqa: F401 -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset # noqa: F401 -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( # noqa: F401 - BertInformationRetrievalDataset, -) from nemo.collections.nlp.data.language_modeling.l2r_lm_dataset import ( # noqa: F401 L2RLanguageModelingDataset, TarredL2RLanguageModelingDataset, @@ -33,11 +29,3 @@ TarredTranslationDataset, TranslationDataset, ) -from nemo.collections.nlp.data.token_classification.token_classification_dataset import ( # noqa: F401 - BertTokenClassificationDataset, - BertTokenClassificationInferDataset, -) -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( # noqa: F401 - ZeroShotIntentDataset, - ZeroShotIntentInferenceDataset, -) diff --git a/nemo/collections/nlp/data/entity_linking/__init__.py b/nemo/collections/nlp/data/entity_linking/__init__.py deleted file mode 100644 index 659718d71b82..000000000000 --- a/nemo/collections/nlp/data/entity_linking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset diff --git a/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py b/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py deleted file mode 100644 index 3b1d97a354f0..000000000000 --- a/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
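The EntityLinkingDataset class that follows records the byte offset of every line in the data file so that `__getitem__` can seek straight to an example instead of loading the whole file into memory. A minimal, self-contained sketch of that idea (not the exact NeMo helpers such as find_newlines or load_data_indices):

    import array

    def build_line_offsets(path):
        # Offset of line i is the cumulative byte length of all preceding lines.
        offsets = array.array("I", [0])
        with open(path, "rb") as f:
            for line in f:
                offsets.append(offsets[-1] + len(line))
        return offsets[:-1]  # drop the offset pointing past the last line

    def read_line_at(path, offsets, idx):
        with open(path, "rb") as f:
            f.seek(offsets[idx])
            return f.readline().decode("utf-8").rstrip("\n")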
- -import array -import pickle as pkl -from typing import Optional - -import torch - -from nemo.collections.nlp.data.data_utils.data_preprocessing import find_newlines, load_data_indices -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['EntityLinkingDataset'] - - -class EntityLinkingDataset(Dataset): - """ - Parent class for entity linking encoder training and index - datasets - - Args: - tokenizer (obj): huggingface tokenizer, - data_file (str): path to tab separated column file where data - pairs appear in the format - concept_ID\tconcept_synonym1\tconcept_synonym2\n - newline_idx_file (str): path to pickle file containing location - of data_file newline characters - max_seq_length (int): maximum length of a concept in tokens - is_index_data (bool): Whether dataset will be used for building - a nearest neighbors index - """ - - def __init__( - self, - tokenizer: object, - data_file: str, - newline_idx_file: Optional[str] = None, - max_seq_length: Optional[int] = 512, - is_index_data: bool = False, - ): - - self.tokenizer = tokenizer - - # Try to load the pair indices file if it already exists - newline_indices, newline_idx_file, _ = load_data_indices(newline_idx_file, data_file, "newline_indices") - - # If the pair indices file doesn't exist, generate and store it - if newline_indices is None: - logging.info("Getting datafile newline indices") - - with open(data_file, "rb") as f: - contents = f.read() - newline_indices = find_newlines(contents) - newline_indices = array.array("I", newline_indices) - - # Store data file indices to avoid generating them again - with open(newline_idx_file, "wb") as f: - pkl.dump(newline_indices, f) - - self.newline_indices = newline_indices - self.data_file = data_file - self.num_lines = len(newline_indices) - self.max_seq_length = max_seq_length - self.is_index_data = is_index_data - - logging.info(f"Loaded dataset with {self.num_lines} examples") - - def __len__(self): - return self.num_lines - - def __getitem__(self, idx): - - concept_offset = self.newline_indices[idx] - - with open(self.data_file, "r", encoding='utf-8-sig') as f: - # Find data pair within datafile using byte offset - f.seek(concept_offset) - concept = f.readline()[:-1] - concept = concept.strip().split("\t") - - if self.is_index_data: - concept_id, concept = concept - return (int(concept_id), concept) - - else: - concept_id, concept1, concept2 = concept - return (int(concept_id), concept1, concept2) - - def _collate_fn(self, batch): - """collate batch of input_ids, segment_ids, input_mask, and label - - Args: - batch: A list of tuples of format (concept_ID, concept_synonym1, concept_synonym2).
- """ - if self.is_index_data: - concept_ids, concepts = zip(*batch) - concept_ids = list(concept_ids) - concepts = list(concepts) - - else: - concept_ids, concepts1, concepts2 = zip(*batch) - concept_ids = list(concept_ids) - concept_ids.extend(concept_ids) # Need to double label list to match each concept - concepts = list(concepts1) - concepts.extend(concepts2) - - batch = self.tokenizer( - concepts, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.max_seq_length, - return_token_type_ids=True, - return_attention_mask=True, - return_length=True, - ) - - return ( - torch.LongTensor(batch["input_ids"]), - torch.LongTensor(batch["token_type_ids"]), - torch.LongTensor(batch["attention_mask"]), - torch.LongTensor(concept_ids), - ) diff --git a/nemo/collections/nlp/data/glue_benchmark/__init__.py b/nemo/collections/nlp/data/glue_benchmark/__init__.py deleted file mode 100644 index 753411382bc1..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset diff --git a/nemo/collections/nlp/data/glue_benchmark/data_processors.py b/nemo/collections/nlp/data/glue_benchmark/data_processors.py deleted file mode 100644 index 3d907f24eff8..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/data_processors.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
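To make the column indexing in the GLUE processors below concrete, here is a small fabricated example of how one MRPC TSV row becomes the text/label fields of an InputExample and a T5 prompted query (the indices follow MrpcProcessor._create_examples; the row content is invented):

    # MRPC rows: label, sentence-1 id, sentence-2 id, sentence 1, sentence 2.
    row = ["1", "id_1", "id_2", "He said hello .", "Hello was said by him ."]
    text_a, text_b, label = row[3], row[4], row[0]
    prompt = f"mrpc sentence1: {text_a} sentence2: {text_b}"
    target = "equivalent" if label == "1" else "not equivalent"
    print(prompt, "->", target)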
- -import os - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor -from nemo.utils import logging - -__all__ = [ - 'ColaProcessor', - 'MnliProcessor', - 'MnliMismatchedProcessor', - 'MrpcProcessor', - 'Sst2Processor', - 'StsbProcessor', - 'QqpProcessor', - 'QnliProcessor', - 'RteProcessor', - 'WnliProcessor', - 'XNLIProcessor', -] - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mrpc sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return "equivalent" if label == "1" else "not equivalent" - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class XNLIProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[6] - text_b = line[7] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli 
hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"cola sentence: {text_a}" - - def label2string(self, label): - return "acceptable" if label == "1" else "not acceptable" - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"sst2 sentence: {text_a}" - - def label2string(self, label): - return "positive" if label == "1" else "negative" - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = 
"%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"stsb sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return '%.1f' % float(label) - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qqp question1: {text_a} question2: {text_b}" - - def label2string(self, label): - return "duplicate" if label == "1" else "not_duplicate" - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qnli question: {text_a} sentence: {text_b}" - - def label2string(self, label): - return label - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, 
line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"rte sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return label - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - def label2string(self, label): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - -class InputExample(object): - """A single training/test example for simple sequence classification. - - Args: - guid: Unique id for the example. - text_a: The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: The untokenized text of the second - sequence. Must only be specified for sequence pair tasks. - label: The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - - def __init__(self, guid: int, text_a: str, text_b: str = None, label: str = None): - """Constructs an InputExample.""" - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - def __repr__(self): - return ( - f"InputExample(guid='{self.guid}', text_a='{self.text_a}', text_b='{self.text_b}', label='{self.label}')" - ) diff --git a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py b/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py deleted file mode 100644 index ef7845895a72..000000000000 --- a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
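A quick sketch of how the processors above feed the T5 text-to-text pipeline (the sentences and usage are invented for illustration; the classes come from the data_processors module deleted in this diff):

from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample, QqpProcessor, RteProcessor

rte = RteProcessor()
example = InputExample(guid="dev-1", text_a="A man is playing a guitar.", text_b="A person is making music.", label="entailment")

# Prompted encoder input, exactly as produced by get_t5_prompted_query above:
prompt = rte.get_t5_prompted_query(example.text_a, example.text_b)
# -> "rte sentence1: A man is playing a guitar. sentence2: A person is making music."

# Decoder target strings come from label2string; RTE labels pass through unchanged,
# while QQP maps "1" -> "duplicate" and anything else -> "not_duplicate".
target = rte.label2string(example.label)        # "entailment"
qqp_target = QqpProcessor().label2string("1")   # "duplicate"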
- -# Some code of this file was adapted from the HuggingFace library available at -# https://github.com/huggingface/transformers - -import os -import pickle -from typing import Dict, List, Optional, Union - -import numpy as np -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import ( - ColaProcessor, - MnliMismatchedProcessor, - MnliProcessor, - MrpcProcessor, - QnliProcessor, - QqpProcessor, - RteProcessor, - Sst2Processor, - StsbProcessor, - WnliProcessor, - XNLIProcessor, -) -from nemo.core.classes import Dataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType, RegressionValuesType -from nemo.utils import logging - -__all__ = ['GLUEDataset', 'TextToTextGLUEDataset', 'TextToTextXNLIDataset'] - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, - "xnli": XNLIProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", - "xnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} - - -class GLUEDataset(Dataset): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - "labels": NeuralType( - tuple('B'), RegressionValuesType() if self.task_name == 'sts-b' else CategoricalValuesType() - ), - } - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: str, - use_cache: bool = True, - compute_features: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - """ - original_file_name = file_name - logging.info(f'Processing {file_name}') - data_dir, file_name = os.path.split(file_name) - file_name = file_name[:-4] - self.tokenizer = tokenizer - evaluate = False if 'train' in file_name else True - - if task_name not in processors: - raise ValueError(f'{task_name} not supported. Choose from {processors.keys()}') - - if task_name == 'mnli' and 'dev_mismatched' in file_name: - self.task_name = 'mnli-mm' - else: - self.task_name = task_name - - processor = processors[self.task_name]() - output_mode = output_modes[self.task_name] - self.label_list = processor.get_labels() - - # TODO: use a different variable to decide whether to trust the user provided filename. This is a temporary workaround for T5 GLUE and XNLI. 
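# For example, task_name "mnli" with a "dev_mismatched" file resolves above to self.task_name == "mnli-mm",
# so processor is MnliMismatchedProcessor(), output_mode is "classification", and self.label_list comes from
# that processor's get_labels().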
- if not compute_features: - if not os.path.exists(original_file_name): - raise ValueError(f"Could not find file: {original_file_name}") - self.examples = processor.get_examples(original_file_name) - else: - self.examples = ( - processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - ) - processor_name = type(processor).__name__ - vocab_size = getattr(tokenizer, "vocab_size", 0) - if compute_features: - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}_{}".format( - processor_name, file_name, tokenizer.name, str(max_seq_length), str(vocab_size) - ), - ) - - if use_cache and os.path.exists(cached_features_file): - logging.info(f"loading from {cached_features_file}") - with open(cached_features_file, "rb") as reader: - self.features = pickle.load(reader) - else: - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - - self.features = self.convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if master_device: - logging.info(f'Saving features into {cached_features_file}') - with open(cached_features_file, "wb") as writer: - pickle.dump(self.features, writer) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.longlong), - np.array(feature.label_id), - ) - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . [SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessary since the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. 
- - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
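# Illustrative padding layout (values invented): with max_seq_length = 8, six real tokens,
# pad_on_left = False and mask_padding_with_zero = True, the padding below produces
#   input_ids   -> [id_1, id_2, id_3, id_4, id_5, id_6, pad_id, pad_id]
#   input_mask  -> [1, 1, 1, 1, 1, 1, 0, 0]
#   segment_ids -> the original segment ids followed by two pad_token_segment_id entries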
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - def _truncate_seq_pair(self, tokens_a: str, tokens_b: str, max_length: int): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class TextToTextGLUEDataset(GLUEDataset): - """GLUE Dataset in a text-to-text format.""" - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - pad_to_max_length: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - prefix_override: if you want to override default prompt for this task specify this via a string. - pad_to_max_length: If true, pad to the maximum length. 
- """ - super().__init__(file_name, task_name, tokenizer, max_seq_length, use_cache, compute_features=False) - self.max_seq_length = max_seq_length - self.max_seq_length_decoder = max_seq_length_decoder - self.pad_to_max_length = pad_to_max_length - self.processor = processors[self.task_name]() - self.prefix_override = prefix_override - self.features = self.convert_examples_to_features() - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx): - enc_query, dec_input, labels = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels} - - def collate_fn(self, batch): - enc_query = [item['text_enc'] for item in batch] - dec_input = [item['text_dec'] for item in batch] - labels = [item['labels'] for item in batch] - - max_enc_query_length = max([len(item) for item in enc_query]) if enc_query else 0 - max_dec_input_length = max([len(item) for item in dec_input]) if dec_input else 0 - max_label_length = max([len(item) for item in labels]) if labels else 0 - if self.pad_to_max_length: - assert max_enc_query_length <= self.max_seq_length - assert max_dec_input_length <= self.max_seq_length_decoder - assert max_label_length <= self.max_seq_length_decoder - max_enc_query_length = self.max_seq_length - max_dec_input_length = self.max_seq_length_decoder - max_label_length = self.max_seq_length_decoder - - loss_mask = [([1] * (len(item))) + ([0] * (max_label_length - len(item))) for item in labels] - enc_query = [item + [self.tokenizer.pad_id] * (max_enc_query_length - len(item)) for item in enc_query] - dec_input = [item + [self.tokenizer.pad_id] * (max_dec_input_length - len(item)) for item in dec_input] - labels = [item + [self.tokenizer.pad_id] * (max_label_length - len(item)) for item in labels] - - enc_query = torch.LongTensor(enc_query) - dec_input = torch.LongTensor(dec_input) - labels = torch.LongTensor(labels) - loss_mask = torch.LongTensor(loss_mask) - - enc_mask = (enc_query != self.tokenizer.pad_id).long() - dec_mask = (dec_input != self.tokenizer.pad_id).long() - - return { - 'text_enc': enc_query, - 'text_dec': dec_input, - 'labels': labels, - 'loss_mask': loss_mask, - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - } - - def make_history_mask_3d(self, block): - batch, length = block.shape - arange = np.arange(length) - history_mask = (arange[None,] <= arange[:, None])[ - None, - ] - history_mask = np.repeat(history_mask, batch, 0) - return history_mask - - def convert_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. 
- """ - features = [] - for ex_index, example in enumerate(self.examples): - if ex_index % 10000 == 0: - logging.info(f"Writing example {ex_index} of {len(self.examples)}") - - text_to_text_query = self.processor.get_t5_prompted_query(example.text_a, example.text_b) - enc_query = self.tokenizer.text_to_ids(text_to_text_query) - if len(enc_query) > self.max_seq_length: - enc_query = enc_query[: self.max_seq_length] - dec_query = ( - [self.tokenizer.bos_id] - + self.tokenizer.text_to_ids(self.processor.label2string(example.label)) - + [self.tokenizer.eos_id] - ) - - dec_input = dec_query[:-1] - labels = dec_query[1:] - - features.append([enc_query, dec_input, labels]) - - return features - - -class TextToTextXNLIDataset(TextToTextGLUEDataset): - """XNLI Dataset in a text-to-text format.""" - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - lang_list: List[str] = None, - pad_to_max_length: bool = True, - ): - self.lang_list = set(lang_list) - super().__init__( - file_name, - task_name, - tokenizer, - max_seq_length, - max_seq_length_decoder, - use_cache, - prefix_override, - pad_to_max_length, - ) - if len(lang_list) <= 0 or lang_list is None: - raise ValueError(f"Found an empty or None lang_list for {self.task_name}") - self.features = self.convert_xnli_examples_to_features() - - def __getitem__(self, idx): - enc_query, dec_input, labels, lang = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels, 'lang': lang} - - def collate_fn(self, batch): - base_batch = super().collate_fn(batch) - base_batch['lang'] = [item['lang'] for item in batch] - return base_batch - - def convert_xnli_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. - """ - features = self.features - lang_filtered_features = [] - for ex_index, example in enumerate(self.examples): - language = example.guid.split('-')[1] - if language in self.lang_list: - lang_filtered_features.append(features[ex_index] + [language]) - return lang_filtered_features - - def __len__(self): - return len(self.features) - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/information_retrieval/__init__.py b/nemo/collections/nlp/data/information_retrieval/__init__.py deleted file mode 100644 index a32196ee7c11..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( - BertInformationRetrievalDataset, -) diff --git a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py deleted file mode 100644 index 0da7af6ed96d..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from random import choices, sample -from typing import Literal, Mapping, Optional - -import datasets -import numpy as np -import torch -from torch.utils.data import Dataset - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['BertEmbeddingDataset'] - - -class BertEmbeddingDataset(Dataset): - """ - Embedding Dataset Class. - """ - - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = True, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - num_hard_negatives: int = 4, - negative_sample_strategy: Literal["random", "first"] = 'first', - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec - (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. - Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. - Data examples will be dropped if they do not meet the min length requirements. 
- add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length - if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. - If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. - Default: {'system_turn_start': '', 'turn_start': '', - 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - negative_sample_strategy: Strategy for negative samples. Options: ['random', 'first'] - """ - # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.max_num_samples = max_num_samples - self.seed = seed - self.index_mapping_dir = index_mapping_dir - self.virtual_tokens = virtual_tokens - self.truncation_method = truncation_method - self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id else self.tokenizer.eos_id - self.negative_sample_strategy = negative_sample_strategy - assert ( - truncation_method == 'left' or truncation_method == 'right' - ), 'truncation_method must be either "left" or "right"' - assert ( - negative_sample_strategy == 'random' or negative_sample_strategy == 'first' - ), 'negative_sample_strategy must be either "random" or "first"' - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - self.data_type = data_type - self.num_hard_negatives = num_hard_negatives - - self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=index_mapping_dir, - workers=memmap_workers, - ) - # Will be None after this call if `max_num_samples` is None - self.samples_mapping = None - self._build_samples_mapping() - logging.info( - f"Creating EmbeddingDataset with seed={self.seed},\n" - f"add_bos={self.add_bos}, add_eos={self.add_eos},\n" - f"max_seq_length={self.max_seq_length}, min_seq_length={self.min_seq_length},\n" - f"pad_token_id={self.pad_token_id}, negative_sample_strategy={self.negative_sample_strategy},\n" - f"num_hard_negatives={self.num_hard_negatives}." 
- ) - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split('/')[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - ) - else: - self.samples_mapping = None - - def __len__(self): - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - assert self.samples_mapping is not None - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - if idx is not None: - assert idx < len(self.indexed_dataset) - else: - idx = -1 - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example['__AUTOGENERATED__'] = True - except Exception as e: - logging.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. - """ - - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - # handle cases where the required number of hard negatives are not present - if len(example['neg_doc']) < self.num_hard_negatives: - nd = example['neg_doc'] - # sample rest with replacement - nd = nd + choices(example['neg_doc'], k=self.num_hard_negatives - len(example['neg_doc'])) - else: - if self.negative_sample_strategy == 'random': - # sample without replacement - # Choose the first self.num_hard_negatives - nd = sample(example['neg_doc'], k=self.num_hard_negatives) - else: - # Choose the first self.num_hard_negatives samples - nd = example['neg_doc'][: self.num_hard_negatives] - assert len(nd) == self.num_hard_negatives, "Error in sampling required number of hard negatives" - nd = [self.tokenizer.text_to_ids("passage: " + ex.strip()) for ex in nd] - - elif self.data_type == 'query': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d, nd = None, None - assert "query_id" in example, "query_id is required for query dataset" - assert "doc_id" in example, "doc_id is required for query dataset" - elif self.data_type == 'doc': - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - assert "doc_id" in example, "doc_id is required for doc dataset" - q, nd = None, None - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - q = q if q is not None else [] - d = d if d is not None else [] - nd = nd if nd is not None else [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - q = [self.tokenizer.eos_id] * self.virtual_tokens 
+ q # type: ignore - d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore - nd = [[self.tokenizer.eos_id] * self.virtual_tokens + n for n in nd] # type: ignore - - if self.add_bos: - q = [self.tokenizer.bos_id] + q # type: ignore - d = [self.tokenizer.bos_id] + d # type: ignore - nd = [[self.tokenizer.bos_id] + n for n in nd] # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - q = q[: self.max_seq_length - 1] - d = d[: self.max_seq_length - 1] - nd = [n[: self.max_seq_length - 1] for n in nd] - - if self.add_eos: - q = q + [self.tokenizer.eos_id] # type: ignore - d = d + [self.tokenizer.eos_id] # type: ignore - nd = [n + [self.tokenizer.eos_id] for n in nd] # type: ignore - - processed_example = { - 'query': q, - 'pos_doc': d, - 'neg_doc': nd, - 'metadata': metadata, - } - return processed_example - - def _maybe_cast_to_list(self, x): - if isinstance(x, np.ndarray): - return [item.tolist() for item in x] - return x - - def _ceil_to_nearest(self, n, m): - return (n + m - 1) // m * m - - def _collate_item(self, item, max_length): - item = self._maybe_cast_to_list(item) - pad_id = self.pad_token_id - if self.truncation_method == 'left': - item = [[pad_id] * (max_length - len(x)) + x for x in item] - else: - item = [x + [pad_id] * (max_length - len(x)) for x in item] - return item - - @torch.no_grad() - def _create_attention_mask2(self, max_length, item_length): - """Create `attention_mask`. - Args: - input_ids: A 1D tensor that holds the indices of tokens. - """ - # seq_length = len(input_ids) - # `attention_mask` has the shape of [1, seq_length, seq_length] - attention_mask = torch.zeros(max_length) - if self.truncation_method == 'left': - # input ids: [pad] [pad] token token | - # attention mask: 0 0 1 1 - attention_mask[max_length - item_length :] = 1 - else: - # input ids: token token [pad] [pad] | - # attention mask: 1 1 0 0 - attention_mask[:item_length] = 1 - return attention_mask - - def _collate_fn(self, batch): - """ - Collate query passage together - """ - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - for nd in item['neg_doc']: - input_ids.append(nd) - lengths.append(len(nd)) - max_length = max( - max_length, len(item['query']), len(item['pos_doc']), *(len(nd) for nd in item['neg_doc']) - ) - elif self.data_type == 'query': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - max_length = max(max_length, len(item['query'])) - elif self.data_type == 'doc': - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - max_length = max(max_length, len(item['pos_doc'])) - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask2(max_length, len) for len in lengths] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in batch] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor(self._collate_item(input_ids, max_length=max_length)) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'input_ids': input_ids, - 'token_type_ids': 
torch.zeros_like(input_ids), - 'attention_mask': attention_mask, - 'metadata': metadata, - 'position_ids': position_ids, - } - - return processed_batch diff --git a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py deleted file mode 100644 index 3a2a8152313e..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py +++ /dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Mapping, Optional - -import datasets -import numpy as np -import torch - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['GPTEmbeddingDataset', 'GPTRerankerDataset'] - - -class GPTEmbeddingDataset(Dataset): - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - """ - # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.max_num_samples = max_num_samples - self.seed = seed - self.index_mapping_dir = index_mapping_dir - self.virtual_tokens = virtual_tokens - self.truncation_method = truncation_method - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - self.data_type = data_type - - self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=index_mapping_dir, - workers=memmap_workers, - ) - - # Will be None after this call if `max_num_samples` is None - self.samples_mapping = None - self._build_samples_mapping() - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split('/')[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - ) - else: - self.samples_mapping = None - - def __len__(self): - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - assert self.samples_mapping is not None - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - assert idx < len(self.indexed_dataset) - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example['__AUTOGENERATED__'] = True - except Exception as e: - logging.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. 
- """ - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - nd = self.tokenizer.text_to_ids("passage: " + example['neg_doc'].strip()) - elif self.data_type == 'query': - q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) - d, nd = None, None - assert "query_id" in example, "query_id is required for query dataset" - assert "doc_id" in example, "doc_id is required for query dataset" - elif self.data_type == 'doc': - d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) - assert "doc_id" in example, "doc_id is required for doc dataset" - q, nd = None, None - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - q = q if q is not None else [] - d = d if d is not None else [] - nd = nd if nd is not None else [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - q = [self.tokenizer.eos_id] * self.virtual_tokens + q # type: ignore - d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore - nd = [self.tokenizer.eos_id] * self.virtual_tokens + nd # type: ignore - - if self.add_bos: - q = [self.tokenizer.bos_id] + q # type: ignore - d = [self.tokenizer.bos_id] + d # type: ignore - nd = [self.tokenizer.bos_id] + nd # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - q = q[: self.max_seq_length - 1] - d = d[: self.max_seq_length - 1] - nd = nd[: self.max_seq_length - 1] - - if self.add_eos: - q = q + [self.tokenizer.eos_id] # type: ignore - d = d + [self.tokenizer.eos_id] # type: ignore - nd = nd + [self.tokenizer.eos_id] # type: ignore - - processed_example = { - 'query': q, - 'pos_doc': d, - 'neg_doc': nd, - 'metadata': metadata, - } - - return processed_example - - def _maybe_cast_to_list(self, x): - if isinstance(x, np.ndarray): - return [item.tolist() for item in x] - return x - - def _ceil_to_nearest(self, n, m): - return (n + m - 1) // m * m - - def _collate_item(self, item, max_length, pad_id): - item = self._maybe_cast_to_list(item) - # max_length = max([len(x) for x in item]) if item else 0 - # here [0] should be tokenizer.pad_id - item = [x + [pad_id] * (max_length - len(x)) for x in item] - return item - - @torch.no_grad() - def _create_attention_mask(self, max_length): - """Create `attention_mask`. - Args: - input_ids: A 1D tensor that holds the indices of tokens. 
- """ - # seq_length = len(input_ids) - # `attention_mask` has the shape of [1, seq_length, seq_length] - attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0) - attention_mask = attention_mask < 0.5 - return attention_mask - - def collate_fn(self, batch): - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - input_ids.append(item['neg_doc']) - lengths.append(len(item['neg_doc'])) - max_length = max(max_length, len(item['query']), len(item['pos_doc']), len(item['neg_doc'])) - elif self.data_type == 'query': - input_ids.append(item['query']) - lengths.append(len(item['query'])) - max_length = max(max_length, len(item['query'])) - elif self.data_type == 'doc': - input_ids.append(item['pos_doc']) - lengths.append(len(item['pos_doc'])) - max_length = max(max_length, len(item['pos_doc'])) - else: - raise ValueError(f"Invalid data type: {self.data_type}") - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask(max_length) for _ in input_ids] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in input_ids] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor( - self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) - ) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'tokens': input_ids, - 'attention_mask': attention_mask, - 'loss_mask': lengths, - 'position_ids': position_ids, - 'metadata': metadata, - } - - return processed_batch - - -class GPTRerankerDataset(GPTEmbeddingDataset): - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - max_num_samples: int = None, - seed: int = 1234, - index_mapping_dir: str = None, - virtual_tokens: int = 0, - memmap_workers: Optional[int] = None, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - data_type: str = 'train', # train, query or doc - ): - """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. - truncation_method: Truncation from which position. 
Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - """ - super().__init__( - file_path=file_path, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - min_seq_length=min_seq_length, - add_bos=add_bos, - add_eos=add_eos, - max_num_samples=max_num_samples, - seed=seed, - index_mapping_dir=index_mapping_dir, - virtual_tokens=virtual_tokens, - memmap_workers=memmap_workers, - truncation_method=truncation_method, - special_tokens=special_tokens, - data_type=data_type, - ) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. - """ - metadata = {k: v for k, v in example.items()} - if self.data_type == 'train': - qd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() - ) - qnd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['neg_doc'].strip() - ) - else: - qd = self.tokenizer.text_to_ids( - "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() - ) - qnd = [] - - if self.virtual_tokens: - # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context - # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) - qd = [self.tokenizer.eos_id] * self.virtual_tokens + qd # type: ignore - qnd = [self.tokenizer.eos_id] * self.virtual_tokens + qnd # type: ignore - - if self.add_bos: - qd = [self.tokenizer.bos_id] + qd # type: ignore - qnd = [self.tokenizer.bos_id] + qnd # type: ignore - - # TODO: (@adithyare) should probably add a warning before truncation - qd = qd[: self.max_seq_length - 1] - qnd = qnd[: self.max_seq_length - 1] - - if self.add_eos: - qd = qd + [self.tokenizer.eos_id] # type: ignore - qnd = qnd + [self.tokenizer.eos_id] # type: ignore - - processed_example = { - 'query_pos_doc': qd, - 'query_neg_doc': qnd, - 'metadata': metadata, - } - - return processed_example - - def collate_fn(self, batch): - input_ids = [] - metadata = [] - lengths = [] - max_length = -1 - for item in batch: - metadata.append(item['metadata']) - if self.data_type == 'train': - input_ids.append(item['query_pos_doc']) - lengths.append(len(item['query_pos_doc'])) - input_ids.append(item['query_neg_doc']) - lengths.append(len(item['query_neg_doc'])) - max_length = max(max_length, len(item['query_pos_doc']), len(item['query_neg_doc'])) - else: - input_ids.append(item['query_pos_doc']) - lengths.append(len(item['query_pos_doc'])) - max_length = max(max_length, len(item['query_pos_doc'])) - - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask(max_length) for _ in input_ids] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in input_ids] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor( - self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) - ) - lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token - - processed_batch = { - 'tokens': input_ids, - 'attention_mask': attention_mask, - 'loss_mask': lengths, - 'position_ids': 
position_ids, - 'metadata': metadata, - } - - return processed_batch diff --git a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py b/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py deleted file mode 100644 index 349f9e43ef97..000000000000 --- a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing as mp -import os -import pickle -import random -from typing import Optional - -import numpy as np -from torch.utils.data import Dataset - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ["BertInformationRetrievalDataset"] - - -class BaseInformationRetrievalDataset(Dataset): - """ - Base information retrieval dataset on which other datasets are built. - - Args: - tokenizer: tokenizer - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - """ - - def __init__( - self, tokenizer: TokenizerSpec, max_query_length: Optional[int] = 31, max_passage_length: Optional[int] = 190, - ): - self.tokenizer = tokenizer - self.max_query_length = max_query_length - self.max_passage_length = max_passage_length - - def parse_npz(self, file, max_seq_length): - """ - Function which parses passages (documents) in npz format. - After pre-processing and tokenization, the dataset will be saved - as numpy matrix, i_th entry of which corresponds to i_th passage (document) - and has the following form: - [n, token_1, ..., token_n, 0, ..., 0] - where n is the passage length (in tokens) and 0s correspond to pad tokens. - - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".npz" - if os.path.isfile(cached_collection): - dataset_npz = np.load(cached_collection)["data"] - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - dataset_npz = np.zeros((len(dataset_dict), max_seq_length + 1)) - for key in dataset_dict: - dataset_npz[key][0] = len(dataset_dict[key]) - dataset_npz[key][1 : len(dataset_dict[key]) + 1] = dataset_dict[key] - np.savez(cached_collection, data=dataset_npz) - return dataset_npz - - def parse_pkl(self, file, max_seq_length): - """ - Function which parses passages (documents, queries) in pkl format. - After pre-processing and tokenization, the dataset will be saved - as pkl dict, i_th entry of which corresponds to i_th passage (document, query) - and has the following form: - {passage_id: [token_1, ..., token_n]} - where n is the passage length (in tokens). 
- - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".pkl" - if os.path.isfile(cached_collection): - dataset_dict = pickle.load(open(cached_collection, "rb")) - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - pickle.dump(dataset_dict, open(cached_collection, "wb")) - return dataset_dict - - def tokenize_dataset(self, file, max_seq_length): - """ - Function which pre-tokenizes the dataset. - """ - lines = open(file, "r").readlines() - with mp.Pool() as pool: - dataset_dict = pool.map(self.preprocess_line, lines) - dataset_dict = {id_: tokens[:max_seq_length] for (id_, tokens) in dataset_dict} - return dataset_dict - - def preprocess_line(self, line): - """ - Parse a single entry (line) of tsv file. - """ - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - id_, text = line.split("\t") - token_ids = self.tokenizer.text_to_ids(text.strip()) - return int(id_), token_ids - - def construct_input(self, token_ids1, max_seq_length, token_ids2=None): - """ - Function which constructs a valid input to BERT from tokens. - - If only one list of tokens (token_ids1) is passed, the input will be - [CLS] token_ids1 [SEP] - - if two lists of tokens are passed, the input will be - [CLS] token_ids1 [SEP] token_ids2 [SEP] - """ - - input_ids = [self.tokenizer.pad_id] * max_seq_length - bert_input = [self.tokenizer.cls_id] + token_ids1 + [self.tokenizer.sep_id] - sentence1_length = len(bert_input) - if token_ids2 is not None: - bert_input = bert_input + token_ids2 + [self.tokenizer.sep_id] - - bert_input = bert_input[:max_seq_length] - - num_nonpad_tokens = len(bert_input) - - input_ids[:num_nonpad_tokens] = bert_input - input_ids = np.array(input_ids, dtype=np.longlong) - input_mask = input_ids != self.tokenizer.pad_id - input_type_ids = np.ones_like(input_ids) - input_type_ids[:sentence1_length] = 0 - - return input_ids, input_mask, input_type_ids - - def preprocess_bert(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into a tensor of size [k, max_length] with the following rows: - [CLS] Q_text [SEP] Pi_text [SEP], i = 1, ..., k - """ - - max_seq_length = self.max_query_length + self.max_passage_length + 3 - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self.queries[query_id], max_seq_length, self._psgid2tokens(psg_id)) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = np.stack(input_type_ids) - - return input_ids, input_mask, input_type_ids - - def preprocess_dpr(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into two tensors of sizes [1, max_q_length] and [k, max_p_length] - with the following rows: - 1) [CLS] Q_text [SEP] - 2) [CLS] Pi_text [SEP], i = 1, ..., k - """ - - q_input_ids, q_input_mask, q_type_ids = self.construct_input(self.queries[query_id], self.max_query_length + 2) - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self._psgid2tokens(psg_id), self.max_passage_length + 2) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = 
np.stack(input_type_ids) - return ( - q_input_ids[None, ...], - q_input_mask[None, ...], - q_type_ids[None, ...], - input_ids, - input_mask, - input_type_ids, - ) - - def _psgid2tokens(self, psg_id): - """ - Internal function which maps passage id to its tokens. - """ - pass - - def psgid2tokens_npz(self, psg_id): - """ - Mapping from passage id to its tokens in case of npz cache format. - """ - seq_len = self.passages[psg_id][0] - return self.passages[psg_id][1 : seq_len + 1].tolist() - - def psgid2tokens_pkl(self, psg_id): - """ - Mapping from passage id to its tokens in case of pkl cache format. - """ - return self.passages[psg_id] - - -class BertInformationRetrievalDataset(BaseInformationRetrievalDataset): - def __init__( - self, - tokenizer: TokenizerSpec, - passages: str, - queries: str, - query_to_passages: str, - max_query_length: Optional[int] = 31, - max_passage_length: Optional[int] = 190, - num_negatives: Optional[int] = 10, - preprocess_fn: Optional[str] = "preprocess_bert", - psg_cache_format: Optional[str] = "npz", - ): - """ - Dataset for training information retrieval models. - - Args: - tokenizer: tokenizer - passages: path to tsv with [psg_id, psg_text] entries - queries: path to tsv with [query_id, query_text] entries - query_to_passages: path to tsv with - [query_id, pos_psg_id, neg_psg_id_1, ..., neg_psg_id_k] entries - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - num_negatives: number of negative passages per positive to use for training - preprocess_fn: either preprocess_bert or preprocess_dpr - preprocess_bert: joint input: [CLS] query [SEP] passage [SEP] - preprocess_dpr: separate inputs: [CLS] query [SEP], [CLS] passage [SEP] - psg_cache_format: either pkl or npz - """ - - super().__init__(tokenizer, max_query_length, max_passage_length) - self.num_negatives = num_negatives - - self.passages = getattr(self, f"parse_{psg_cache_format}")(passages, max_passage_length) - self._psgid2tokens = getattr(self, f"psgid2tokens_{psg_cache_format}") - self.queries = self.parse_pkl(queries, max_query_length) - self.idx2psgs = self.parse_query_to_passages(query_to_passages) - self._preprocess_fn = getattr(self, preprocess_fn) - - def __getitem__(self, idx): - query_and_psgs = self.idx2psgs[idx] - query_id, psg_ids = query_and_psgs[0], query_and_psgs[1:] - inputs = self._preprocess_fn(query_id, psg_ids) - return [*inputs, query_id, np.array(psg_ids)] - - def __len__(self): - return len(self.idx2psgs) - - def parse_query_to_passages(self, file): - """ - Function which parses query to passages correspondence file. 
- """ - idx2psgs = {} - idx = 0 - for line in open(file, "r").readlines(): - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - query_and_psgs = line.split("\t") - query_and_psgs_ids = [int(id_) for id_ in query_and_psgs] - query_and_rel_psg_ids, irrel_psgs_ids = query_and_psgs_ids[:2], query_and_psgs_ids[2:] - random.shuffle(irrel_psgs_ids) - num_samples = len(irrel_psgs_ids) // self.num_negatives - for j in range(num_samples): - left = self.num_negatives * j - right = self.num_negatives * (j + 1) - idx2psgs[idx] = query_and_rel_psg_ids + irrel_psgs_ids[left:right] - idx += 1 - return idx2psgs diff --git a/nemo/collections/nlp/data/intent_slot_classification/__init__.py b/nemo/collections/nlp/data/intent_slot_classification/__init__.py deleted file mode 100644 index 3e1782e02e4f..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import ( - IntentSlotClassificationDataset, - IntentSlotInferenceDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_descriptor import ( - IntentSlotDataDesc, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_dataset import ( - MultiLabelIntentSlotClassificationDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_descriptor import ( - MultiLabelIntentSlotDataDesc, -) diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py deleted file mode 100644 index a73341aa719d..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['IntentSlotClassificationDataset', 'IntentSlotInferenceDataset'] - - -def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - raw_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, -): - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = False - if raw_slots is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([raw_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - - # May be useful for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - 
logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class IntentSlotClassificationDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - For dataset to use during inference without labels, see - IntentSlotDataset. - - Args: - input_file: file to sequence + label. the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - slot_file: file to slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - query = ' '.join(parts[:-1]) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, 
idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - -class IntentSlotInferenceDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - if do_lower_case: - for idx, query in enumerate(queries): - queries[idx] = queries[idx].lower() - - features = get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py deleted file mode 100644 index 544b5e1db858..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_label_stats, - if_exist, -) -from nemo.utils import logging - - -class IntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - IntentSlotDataDesc. - - By default, the None label for slots is 'O'. - - IntentSlotDataDesc requires two files: - - input_file: file to sequence + label. 
- the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ['train', 'test', 'dev'], - none_slot_label: str = 'O', - pad_label: int = -1, - ): - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - - self.intents_label_ids = IntentSlotDataDesc.label2idx(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = IntentSlotDataDesc.label2idx(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - logging.info(f' Stats calculating for {mode} mode...') - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - - logging.info(f'Three most popular intents in {mode} mode:') - total_intents, intent_label_freq, max_id = get_label_stats( - raw_intents, infold + f'/{mode}_intent_stats.tsv' - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f'Three most popular slots in {mode} mode:') - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - logging.info(f'Total Number of Intents: {total_intents}') - logging.info(f'Intent Label Frequencies: {intent_label_freq}') - logging.info(f'Total Number of Slots: {slots_total}') - logging.info(f'Slots Label Frequencies: {slots_label_freq}') - - if mode == 'train': - intent_weights_dict = get_freq_weights(intent_label_freq) - logging.info(f'Intent Weights: {intent_weights_dict}') - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f'Slot Weights: {slot_weights_dict}') - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = self.slots_label_ids[none_slot_label] - - @staticmethod - def label2idx(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - @staticmethod - def intent_slot_dicts(data_dir): - ''' - Return Intent and slot dictionaries - ''' - intent_dict_file = data_dir + '/dict.intents.csv' - slot_dict_file = data_dir + '/dict.slots.csv' - - intents_labels = open(intent_dict_file, 'r').readlines() - intents_labels = [line.strip() for line in intents_labels if line.strip()] - - slots_labels = open(slot_dict_file, 'r').readlines() - slots_labels = [line.strip() for line in slots_labels if line.strip()] - - return intents_labels, slots_labels diff --git a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py deleted file mode 100644 index 32a72d107193..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
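As a quick illustration of the label-to-id convention that IntentSlotDataDesc.label2idx above relies on: the i-th non-empty line of dict.intents.csv or dict.slots.csv maps to label id i, and the id of none_slot_label becomes the default pad_label. The slot names below are invented; this is just an equivalent snippet, not part of the deleted module.

```python
# Hypothetical dict.slots.csv contents, one label per line; line index == label id.
slot_lines = ["O\n", "B-city\n", "I-city\n", "B-date\n"]

# Equivalent of IntentSlotDataDesc.label2idx: drop blank lines, strip, map label -> index.
stripped = [line.strip() for line in slot_lines if line.strip()]
slots_label_ids = {stripped[i]: i for i in range(len(stripped))}

print(slots_label_ids)            # {'O': 0, 'B-city': 1, 'I-city': 2, 'B-date': 3}
pad_label = slots_label_ids["O"]  # with none_slot_label='O', pad_label defaults to 0
```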
- -from typing import Dict, Optional - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.intent_slot_classification import IntentSlotClassificationDataset -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import get_features -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType - -__all__ = ['MultiLabelIntentSlotClassificationDataset'] - - -class MultiLabelIntentSlotClassificationDataset(IntentSlotClassificationDataset): - """ - Creates dataset to use for the task of multi-label joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - Args: - input_file: file containing sentences + labels. The first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label can be multiple labels separated by a comma - slot_file: file containing slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - num_intents: total number of intents in dict.intents file - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': [NeuralType(('B'), LabelsType())], - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - num_intents: int, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(num_intents)] - raw_intents.append(tuple(parts)) - tokens = input_line.strip().split("\t")[0].split() - query = ' '.join(tokens) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - - 
self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents diff --git a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py b/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py deleted file mode 100644 index ddde1a2896de..000000000000 --- a/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_freq_weights_bce_with_logits_loss, - get_label_stats, - get_labels_to_labels_id_mapping, - get_multi_label_stats, - if_exist, -) -from nemo.utils import logging - - -class MultiLabelIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - MultiLabelIntentSlotDataDesc. - - By default, the None label for slots is 'O'. - - MultiLabelIntentSlotDataDesc requires two files: - - input_file: file containing sentences + labels. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label is a string of comma separated values. - Example: 1 or 1,2 are both valid labels - - slot_file: file containing slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ["train", "test", "dev"], - none_slot_label: str = "O", - pad_label: int = -1, - ): - if not if_exist(data_dir, ["dict.intents.csv", "dict.slots.csv"]): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by MultiLabelIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." 
- ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + "/dict.intents.csv" - self.slot_dict_file = self.data_dir + "/dict.slots.csv" - - self.intents_label_ids = get_labels_to_labels_id_mapping(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = get_labels_to_labels_id_mapping(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f"{mode}.tsv"]): - logging.info(f" Stats calculation for {mode} mode" f" is skipped as {mode}.tsv was not found.") - continue - logging.info(f" Stats calculating for {mode} mode...") - slot_file = f"{self.data_dir}/{mode}_slots.tsv" - with open(slot_file, "r") as f: - slot_lines = f.readlines() - - input_file = f"{self.data_dir}/{mode}.tsv" - with open(input_file, "r") as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." - ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(self.num_intents)] - raw_intents.append(tuple(parts)) - - logging.info(f"Three most popular intents in {mode} mode:") - total_intents, intent_label_freq, max_id = get_multi_label_stats( - raw_intents, infold + f"/{mode}_intent_stats.tsv" - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f"Three most popular slots in {mode} mode:") - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f"/{mode}_slot_stats.tsv") - - logging.info(f"Total Number of Intent Labels: {total_intents}") - logging.info(f"Intent Label Frequencies: {intent_label_freq}") - logging.info(f"Total Number of Slots: {slots_total}") - logging.info(f"Slots Label Frequencies: {slots_label_freq}") - - if mode == "train": - intent_weights_dict = get_freq_weights_bce_with_logits_loss(intent_label_freq) - logging.info(f"Intent Weights: {intent_weights_dict}") - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f"Slot Weights: {slot_weights_dict}") - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f"none_slot_label {none_slot_label} not " f"found in {self.slot_dict_file}.") - self.pad_label = self.slots_label_ids[none_slot_label] diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py b/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py deleted file mode 100644 index 4e786276108c..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.spellchecking_asr_customization.dataset import ( - SpellcheckingAsrCustomizationDataset, - SpellcheckingAsrCustomizationTestDataset, - TarredSpellcheckingAsrCustomizationDataset, -) diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py deleted file mode 100644 index c98abb300c64..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright 2019 The Google Research Authors. -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from collections import OrderedDict -from os import path -from typing import Dict, List, Optional, Tuple, Union - -from transformers import PreTrainedTokenizerBase - -from nemo.utils.decorators import deprecated_warning - -"""Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. -""" - - -class BertExample(object): - """Class for training and inference examples for BERT. - - Attributes: - features: Feature dictionary. - """ - - def __init__( - self, - input_ids: List[int], - input_mask: List[int], - segment_ids: List[int], - input_ids_for_subwords: List[int], - input_mask_for_subwords: List[int], - segment_ids_for_subwords: List[int], - character_pos_to_subword_pos: List[int], - fragment_indices: List[Tuple[int, int, int]], - labels_mask: List[int], - labels: List[int], - spans: List[Tuple[int, int, int]], - default_label: int, - ) -> None: - """Inputs to the example wrapper - - Args: - input_ids: indices of single characters (treated as subwords) - input_mask: list of bools with 0s in place of input_ids to be masked - segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, - 1 - for tokens of the first candidate - ... 
- 10 - for tokens of the tenth candidate - ) - input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) - input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked - segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) - fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set - labels_mask: bool tensor with 0s in place of label tokens to be masked - labels: indices of semiotic classes which should be predicted from each of the - corresponding input tokens - spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) - default_label: The default label - """ - # deprecation warning - deprecated_warning("BertExample") - - input_len = len(input_ids) - if not ( - input_len == len(input_mask) - and input_len == len(segment_ids) - and input_len == len(labels_mask) - and input_len == len(labels) - and input_len == len(character_pos_to_subword_pos) - ): - raise ValueError("All feature lists should have the same length ({})".format(input_len)) - - input_len_for_subwords = len(input_ids_for_subwords) - if not ( - input_len_for_subwords == len(input_mask_for_subwords) - and input_len_for_subwords == len(segment_ids_for_subwords) - ): - raise ValueError( - "All feature lists for subwords should have the same length ({})".format(input_len_for_subwords) - ) - - self.features = OrderedDict( - [ - ("input_ids", input_ids), - ("input_mask", input_mask), - ("segment_ids", segment_ids), - ("input_ids_for_subwords", input_ids_for_subwords), - ("input_mask_for_subwords", input_mask_for_subwords), - ("segment_ids_for_subwords", segment_ids_for_subwords), - ("character_pos_to_subword_pos", character_pos_to_subword_pos), - ("fragment_indices", fragment_indices), - ("labels_mask", labels_mask), - ("labels", labels), - ("spans", spans), - ] - ) - self._default_label = default_label - - -class BertExampleBuilder(object): - """Builder class for BertExample objects.""" - - def __init__( - self, - label_map: Dict[str, int], - semiotic_classes: Dict[str, int], - tokenizer: PreTrainedTokenizerBase, - max_seq_length: int, - ) -> None: - """Initializes an instance of BertExampleBuilder. - - Args: - label_map: Mapping from tags to tag IDs. - semiotic_classes: Mapping from semiotic classes to their ids. - tokenizer: Tokenizer object. - max_seq_length: Maximum sequence length. - """ - # deprecation warning - deprecated_warning("BertExampleBuilder") - - self._label_map = label_map - self._semiotic_classes = semiotic_classes - self._tokenizer = tokenizer - self._max_seq_length = max_seq_length - # one span usually covers one or more words and it only exists for custom phrases, so there are much less spans than characters. - self._max_spans_length = max(4, int(max_seq_length / 20)) - self._pad_id = self._tokenizer.pad_token_id - self._default_label = 0 - - def build_bert_example( - self, hyp: str, ref: str, target: Optional[str] = None, span_info: Optional[str] = None, infer: bool = False - ) -> Optional[BertExample]: - """Constructs a BERT Example. - - Args: - hyp: Hypothesis text. - ref: Candidate customization variants divided by ';' - target: - if infer==False, string of labels (each label is 1-based index of correct candidate) or 0. 
- if infer==True, it can be None or string of labels (each label is 1-based index of some candidate). In inference this can be used to get corresponding fragments to fragment_indices. - span_info: - string of format "CUSTOM 6 20;CUSTOM 40 51", number of parts corresponds to number of targets. Can be empty if target is 0. - If infer==False, numbers are correct start and end(exclusive) positions of the corresponding target candidate in the text. - If infer==True, numbers are EXPECTED positions in the text. In inference this can be used to get corresponding fragments to fragment_indices. - infer: inference mode - Returns: - BertExample, or None if the conversion from text to tags was infeasible - - Example (infer=False): - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - target: "1 3" - span_info: "CUSTOM 12 23;CUSTOM 28 41" - """ - if not ref.count(";") == 9: - raise ValueError("Expect 10 candidates: " + ref) - - span_info_parts = [] - targets = [] - - if len(target) > 0 and target != "0": - span_info_parts = span_info.split(";") - targets = list(map(int, target.split(" "))) - if len(span_info_parts) != len(targets): - raise ValueError( - "len(span_info_parts)=" - + str(len(span_info_parts)) - + " is different from len(target_parts)=" - + str(len(targets)) - ) - - tags = [0 for _ in hyp.split()] - if not infer: - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - ( - input_ids, - input_mask, - segment_ids, - labels_mask, - labels, - _, - _, - ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = self._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - # used in forward to concatenate subword embeddings to character embeddings - character_pos_to_subword_pos = self._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - fragment_indices = [] - if infer: - # used in inference to take argmax over whole fragments instead of separate characters to get more consistent predictions - fragment_indices = self._get_fragment_indices(hyp, targets, span_info_parts) - - spans = [] - if not infer: - # during training spans are used in validation step to calculate accuracy on whole custom phrases instead of separate characters - spans = self._get_spans(span_info_parts) - - if len(input_ids) > self._max_seq_length or len(spans) > self._max_spans_length: - print( - "Max len exceeded: len(input_ids)=", - len(input_ids), - "; _max_seq_length=", - self._max_seq_length, - "; len(spans)=", - len(spans), - "; _max_spans_length=", - self._max_spans_length, - ) - return None - - example = BertExample( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - input_ids_for_subwords=input_ids_for_subwords, - input_mask_for_subwords=input_mask_for_subwords, - segment_ids_for_subwords=segment_ids_for_subwords, - character_pos_to_subword_pos=character_pos_to_subword_pos, - fragment_indices=fragment_indices, - 
labels_mask=labels_mask, - labels=labels, - spans=spans, - default_label=self._default_label, - ) - return example - - def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] - """ - result_spans = [] - - for p in span_info_parts: - if p == "": - break - c, start, end = p.split(" ") - if c not in self._semiotic_classes: - raise KeyError("class=" + c + " not found in self._semiotic_classes") - cid = self._semiotic_classes[c] - # +1 because this should be indexing on input_ids which has [CLS] token at beginning - start = int(start) + 1 - end = int(end) + 1 - result_spans.append((cid, start, end)) - return result_spans - - def _get_fragment_indices( - self, hyp: str, targets: List[int], span_info_parts: List[str] - ) -> Tuple[List[Tuple[int, int, int]]]: - """Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). - In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). - - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ - - fragment_indices = [] - - letters = hyp.split() - - for target, p in zip(targets, span_info_parts): - _, start, end = p.split(" ") - start = int(start) - end = min(int(end), len(hyp)) # guarantee that end is not outside length - - # Adjusting strategy 1: expand both sides to the nearest space. - # Adjust start by finding the nearest left space or beginning of text. If start is already some word beginning, it won't change. - k = start - while k > 0 and letters[k] != '_': - k -= 1 - adjusted_start = k if k == 0 else k + 1 - - # Adjust end by finding the nearest right space. If end is already space or sentence end, it won't change. - k = end - while k < len(letters) and letters[k] != '_': - k += 1 - adjusted_end = k - - # +1 because this should be indexing on input_ids which has [CLS] token at beginning - fragment_indices.append((adjusted_start + 1, adjusted_end + 1, target)) - - # Adjusting strategy 2: try to shrink to the closest space (from left or right or both sides). 
- # For example, here the candidate "shippers" has a matching n-gram covering part of previous word - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - expanded_fragment = "".join(letters[adjusted_start:adjusted_end]) - left_space_position = expanded_fragment.find("_") - right_space_position = expanded_fragment.rfind("_") - is_left_shrink = False - is_right_shrink = False - if left_space_position > -1 and left_space_position < len(expanded_fragment) / 2: - # +1 because of CLS token, another +1 to put start position after found space - fragment_indices.append((adjusted_start + 1 + left_space_position + 1, adjusted_end + 1, target)) - is_left_shrink = True - if right_space_position > -1 and right_space_position > len(expanded_fragment) / 2: - fragment_indices.append((adjusted_start + 1, adjusted_start + 1 + right_space_position, target)) - is_right_shrink = True - if is_left_shrink and is_right_shrink: - fragment_indices.append( - (adjusted_start + 1 + left_space_position + 1, adjusted_start + 1 + right_space_position, target) - ) - - return fragment_indices - - def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... 
, 45, 46, 46, 46, 46, 46, 47] - """ - character_pos_to_subword_pos = [0 for _ in input_ids] - - ## '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' - tokens = self._tokenizer.convert_ids_to_tokens(input_ids) - ## '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] - tokens_for_subwords = self._tokenizer.convert_ids_to_tokens(input_ids_for_subwords) - j = 0 # index for tokens_for_subwords - j_offset = 0 # current letter index within subword - for i in range(len(tokens)): - character = tokens[i] - subword = tokens_for_subwords[j] - if character == "[CLS]" and subword == "[CLS]": - character_pos_to_subword_pos[i] = j - j += 1 - continue - if character == "[SEP]" and subword == "[SEP]": - character_pos_to_subword_pos[i] = j - j += 1 - continue - if character == "[CLS]" or character == "[SEP]" or subword == "[CLS]" or subword == "[SEP]": - raise IndexError( - "character[" - + str(i) - + "]=" - + character - + "; subword[" - + str(j) - + ";=" - + subword - + "subwords=" - + str(tokens_for_subwords) - ) - # At this point we expect that - # subword either 1) is a normal first token of a word or 2) starts with "##" (not first word token) - # character either 1) is a normal character or 2) is a space character "_" - if character == "_": - character_pos_to_subword_pos[i] = j - 1 # space is assigned to previous subtoken - continue - if j_offset < len(subword): - if character == subword[j_offset]: - character_pos_to_subword_pos[i] = j - j_offset += 1 - else: - raise IndexError( - "character mismatch:" - + "i=" - + str(i) - + "j=" - + str(j) - + "j_offset=" - + str(j_offset) - + "; len(tokens)=" - + str(len(tokens)) - + "; len(subwords)=" - + str(len(tokens_for_subwords)) - ) - # if subword is finished, increase j - if j_offset >= len(subword): - j += 1 - j_offset = 0 - if j >= len(tokens_for_subwords): - break - if tokens_for_subwords[j].startswith("##"): - j_offset = 2 - # check that all subword tokens are processed - if j < len(tokens_for_subwords): - raise IndexError( - "j=" - + str(j) - + "; len(tokens)=" - + str(len(tokens)) - + "; len(subwords)=" - + str(len(tokens_for_subwords)) - ) - return character_pos_to_subword_pos - - def _get_input_features( - self, hyp: str, ref: str, tags: List[int] - ) -> Tuple[List[int], List[int], List[int], List[int], List[int], List[str], List[int]]: - """Converts given ASR-hypothesis(hyp) and candidate string(ref) to features(token ids, mask, segment ids, etc). - - Args: - hyp: Hypothesis text. - ref: Candidate customization variants divided by ';' - tags: List of labels corresponding to each token of ASR-hypothesis or None when building an example during inference. - Returns: - Features (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) - - Note that this method is called both for character-based example and for word-based example (to split to subwords). 
- - Character-based example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - tags: "0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3" - - resulting token sequence: - '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' - - Word-based example: - hyp: "astronomers didie somon and tristian gllo" - ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" - tags: None (not used for word-based case) - - resulting token sequence: - '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] - """ - - labels_mask = [] - labels = [] - if tags is None: - hyp_tokens, token_start_indices = self._split_to_wordpieces(hyp.split()) - else: - hyp_tokens, labels, token_start_indices = self._split_to_wordpieces_with_labels(hyp.split(), tags) - references = ref.split(";") - all_ref_tokens = [] - all_ref_segment_ids = [] - for i in range(len(references)): - ref_tokens, _ = self._split_to_wordpieces(references[i].split()) - all_ref_tokens.extend(ref_tokens + ["[SEP]"]) - all_ref_segment_ids.extend([i + 1] * (len(ref_tokens) + 1)) - - input_tokens = ["[CLS]"] + hyp_tokens + ["[SEP]"] + all_ref_tokens # ends with [SEP] - input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens) - input_mask = [1] * len(input_ids) - segment_ids = [0] + [0] * len(hyp_tokens) + [0] + all_ref_segment_ids - if len(input_ids) != len(segment_ids): - raise ValueError( - "len(input_ids)=" - + str(len(input_ids)) - + " is different from len(segment_ids)=" - + str(len(segment_ids)) - ) - - if tags: - labels_mask = [0] + [1] * len(labels) + [0] + [0] * len(all_ref_tokens) - labels = [0] + labels + [0] + [0] * len(all_ref_tokens) - return (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) - - def _split_to_wordpieces_with_labels( - self, tokens: List[str], labels: List[int] - ) -> Tuple[List[str], List[int], List[int]]: - """Splits tokens (and the labels accordingly) to WordPieces. - - Args: - tokens: Tokens to be split. - labels: Labels (one per token) to be split. - - Returns: - 3-tuple with the split tokens, split labels, and the indices of starting tokens of words - """ - bert_tokens = [] # Original tokens split into wordpieces. - bert_labels = [] # Label for each wordpiece. - # Index of each wordpiece that starts a new token. - token_start_indices = [] - for i, token in enumerate(tokens): - # '+ 1' is because bert_tokens will be prepended by [CLS] token later. - token_start_indices.append(len(bert_tokens) + 1) - pieces = self._tokenizer.tokenize(token) - bert_tokens.extend(pieces) - bert_labels.extend([labels[i]] * len(pieces)) - return bert_tokens, bert_labels, token_start_indices - - def _split_to_wordpieces(self, tokens: List[str]) -> Tuple[List[str], List[int]]: - """Splits tokens to WordPieces. - - Args: - tokens: Tokens to be split. - - Returns: - tuple with the split tokens, and the indices of the WordPieces that start a token. 
- """ - bert_tokens = [] # Original tokens split into wordpieces. - # Index of each wordpiece that starts a new token. - token_start_indices = [] - for i, token in enumerate(tokens): - # '+ 1' is because bert_tokens will be prepended by [CLS] token later. - token_start_indices.append(len(bert_tokens) + 1) - pieces = self._tokenizer.tokenize(token) - bert_tokens.extend(pieces) - return bert_tokens, token_start_indices - - def read_input_file( - self, input_filename: str, infer: bool = False - ) -> Union[List['BertExample'], Tuple[List['BertExample'], Tuple[str, str]]]: - """Reads in Tab Separated Value file and converts to training/inference-ready examples. - - Args: - example_builder: Instance of BertExampleBuilder - input_filename: Path to the TSV input file. - infer: If true, input examples do not contain target info. - - Returns: - examples: List of converted examples (BertExample). - or - (examples, hyps_refs): If infer==true, returns h - """ - - if not path.exists(input_filename): - raise ValueError("Cannot find file: " + input_filename) - examples = [] # output list of BertExample - hyps_refs = [] # output list of tuples (ASR-hypothesis, candidate_str) - with open(input_filename, 'r') as f: - for line in f: - if len(examples) % 1000 == 0: - logging.info("{} examples processed.".format(len(examples))) - if infer: - parts = line.rstrip('\n').split('\t') - hyp, ref, target, span_info = parts[0], parts[1], None, None - if len(parts) == 4: - target, span_info = parts[2], parts[3] - try: - example = self.build_bert_example(hyp, ref, target=target, span_info=span_info, infer=infer) - except Exception as e: - logging.warning(str(e)) - logging.warning(line) - continue - if example is None: - logging.info("cannot create example: ") - logging.info(line) - continue - hyps_refs.append((hyp, ref)) - examples.append(example) - else: - hyp, ref, target, semiotic_info = line.rstrip('\n').split('\t') - try: - example = self.build_bert_example( - hyp, ref, target=target, span_info=semiotic_info, infer=infer - ) - except Exception as e: - logging.warning(str(e)) - logging.warning(line) - continue - if example is None: - logging.info("cannot create example: ") - logging.info(line) - continue - examples.append(example) - logging.info(f"Done. {len(examples)} examples converted.") - if infer: - return examples, hyps_refs - return examples diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py b/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py deleted file mode 100644 index 5898e6e83bdd..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py +++ /dev/null @@ -1,523 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pickle -from io import BytesIO -from typing import Dict, List, Optional, Tuple - -import braceexpand -import numpy as np -import torch -import webdataset as wds - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.core.classes.dataset import Dataset, IterableDataset -from nemo.core.neural_types import ChannelType, IntType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.distributed import webdataset_split_by_workers - -__all__ = [ - "SpellcheckingAsrCustomizationDataset", - "SpellcheckingAsrCustomizationTestDataset", - "TarredSpellcheckingAsrCustomizationDataset", -] - - -def collate_train_dataset( - batch: List[ - Tuple[ - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - ] - ], - pad_token_id: int, -) -> Tuple[ - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, -]: - """collate batch of training items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans). - pad_token_id: integer id of padding token (to use in padded_input_ids, padded_input_ids_for_subwords) - """ - max_length = 0 - max_length_for_subwords = 0 - max_length_for_spans = 1 # to avoid empty tensor - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) in batch: - if len(input_ids) > max_length: - max_length = len(input_ids) - if len(input_ids_for_subwords) > max_length_for_subwords: - max_length_for_subwords = len(input_ids_for_subwords) - if len(spans) > max_length_for_spans: - max_length_for_spans = len(spans) - - padded_input_ids = [] - padded_input_mask = [] - padded_segment_ids = [] - padded_input_ids_for_subwords = [] - padded_input_mask_for_subwords = [] - padded_segment_ids_for_subwords = [] - padded_character_pos_to_subword_pos = [] - padded_labels_mask = [] - padded_labels = [] - padded_spans = [] - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) in batch: - if len(input_ids) < max_length: - pad_length = max_length - len(input_ids) - padded_input_ids.append(np.pad(input_ids, pad_width=[0, pad_length], constant_values=pad_token_id)) - padded_input_mask.append(np.pad(input_mask, pad_width=[0, pad_length], constant_values=0)) - padded_segment_ids.append(np.pad(segment_ids, pad_width=[0, pad_length], constant_values=0)) - padded_labels_mask.append(np.pad(labels_mask, pad_width=[0, pad_length], constant_values=0)) - padded_labels.append(np.pad(labels, pad_width=[0, pad_length], constant_values=0)) - padded_character_pos_to_subword_pos.append( - np.pad(character_pos_to_subword_pos, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids.append(input_ids) - padded_input_mask.append(input_mask) - padded_segment_ids.append(segment_ids) - padded_labels_mask.append(labels_mask) - padded_labels.append(labels) - padded_character_pos_to_subword_pos.append(character_pos_to_subword_pos) - - if len(input_ids_for_subwords) < 
max_length_for_subwords: - pad_length = max_length_for_subwords - len(input_ids_for_subwords) - padded_input_ids_for_subwords.append( - np.pad(input_ids_for_subwords, pad_width=[0, pad_length], constant_values=pad_token_id) - ) - padded_input_mask_for_subwords.append( - np.pad(input_mask_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - padded_segment_ids_for_subwords.append( - np.pad(segment_ids_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids_for_subwords.append(input_ids_for_subwords) - padded_input_mask_for_subwords.append(input_mask_for_subwords) - padded_segment_ids_for_subwords.append(segment_ids_for_subwords) - - if len(spans) < max_length_for_spans: - padded_spans.append(np.ones((max_length_for_spans, 3), dtype=int) * -1) # pad value is [-1, -1, -1] - if len(spans) > 0: - padded_spans[-1][: spans.shape[0], : spans.shape[1]] = spans # copy actual spans to the beginning - else: - padded_spans.append(spans) - - return ( - torch.LongTensor(np.array(padded_input_ids)), - torch.LongTensor(np.array(padded_input_mask)), - torch.LongTensor(np.array(padded_segment_ids)), - torch.LongTensor(np.array(padded_input_ids_for_subwords)), - torch.LongTensor(np.array(padded_input_mask_for_subwords)), - torch.LongTensor(np.array(padded_segment_ids_for_subwords)), - torch.LongTensor(np.array(padded_character_pos_to_subword_pos)), - torch.LongTensor(np.array(padded_labels_mask)), - torch.LongTensor(np.array(padded_labels)), - torch.LongTensor(np.array(padded_spans)), - ) - - -def collate_test_dataset( - batch: List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]], - pad_token_id: int, -) -> Tuple[ - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, - torch.LongTensor, -]: - """collate batch of test items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, fragment_indices). 
- pad_token_id: integer id of padding token (to use in padded_input_ids, padded_input_ids_for_subwords) - """ - max_length = 0 - max_length_for_subwords = 0 - max_length_for_fragment_indices = 1 # to avoid empty tensor - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) in batch: - if len(input_ids) > max_length: - max_length = len(input_ids) - if len(input_ids_for_subwords) > max_length_for_subwords: - max_length_for_subwords = len(input_ids_for_subwords) - if len(fragment_indices) > max_length_for_fragment_indices: - max_length_for_fragment_indices = len(fragment_indices) - - padded_input_ids = [] - padded_input_mask = [] - padded_segment_ids = [] - padded_input_ids_for_subwords = [] - padded_input_mask_for_subwords = [] - padded_segment_ids_for_subwords = [] - padded_character_pos_to_subword_pos = [] - padded_fragment_indices = [] - for ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) in batch: - if len(input_ids) < max_length: - pad_length = max_length - len(input_ids) - padded_input_ids.append(np.pad(input_ids, pad_width=[0, pad_length], constant_values=pad_token_id)) - padded_input_mask.append(np.pad(input_mask, pad_width=[0, pad_length], constant_values=0)) - padded_segment_ids.append(np.pad(segment_ids, pad_width=[0, pad_length], constant_values=0)) - padded_character_pos_to_subword_pos.append( - np.pad(character_pos_to_subword_pos, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids.append(input_ids) - padded_input_mask.append(input_mask) - padded_segment_ids.append(segment_ids) - padded_character_pos_to_subword_pos.append(character_pos_to_subword_pos) - - if len(input_ids_for_subwords) < max_length_for_subwords: - pad_length = max_length_for_subwords - len(input_ids_for_subwords) - padded_input_ids_for_subwords.append( - np.pad(input_ids_for_subwords, pad_width=[0, pad_length], constant_values=pad_token_id) - ) - padded_input_mask_for_subwords.append( - np.pad(input_mask_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - padded_segment_ids_for_subwords.append( - np.pad(segment_ids_for_subwords, pad_width=[0, pad_length], constant_values=0) - ) - else: - padded_input_ids_for_subwords.append(input_ids_for_subwords) - padded_input_mask_for_subwords.append(input_mask_for_subwords) - padded_segment_ids_for_subwords.append(segment_ids_for_subwords) - - if len(fragment_indices) < max_length_for_fragment_indices: - # we use [0, 1, 0] as padding value for fragment_indices, it corresponds to [CLS] token, which is ignored and won't affect anything - p = np.zeros((max_length_for_fragment_indices, 3), dtype=int) - p[:, 1] = 1 - p[:, 2] = 0 - padded_fragment_indices.append(p) - if len(fragment_indices) > 0: - padded_fragment_indices[-1][ - : fragment_indices.shape[0], : fragment_indices.shape[1] - ] = fragment_indices # copy actual fragment_indices to the beginning - else: - padded_fragment_indices.append(fragment_indices) - - return ( - torch.LongTensor(np.array(padded_input_ids)), - torch.LongTensor(np.array(padded_input_mask)), - torch.LongTensor(np.array(padded_segment_ids)), - torch.LongTensor(np.array(padded_input_ids_for_subwords)), - torch.LongTensor(np.array(padded_input_mask_for_subwords)), - torch.LongTensor(np.array(padded_segment_ids_for_subwords)), - 
torch.LongTensor(np.array(padded_character_pos_to_subword_pos)), - torch.LongTensor(np.array(padded_fragment_indices)), - ) - - -class SpellcheckingAsrCustomizationDataset(Dataset): - """ - Dataset as used by the SpellcheckingAsrCustomizationModel for training and validation pipelines. - - Args: - input_file (str): path to tsv-file with data - example_builder: instance of BertExampleBuilder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "input_mask": NeuralType(('B', 'T'), MaskType()), - "segment_ids": NeuralType(('B', 'T'), ChannelType()), - "input_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "input_mask_for_subwords": NeuralType(('B', 'T'), MaskType()), - "segment_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "character_pos_to_subword_pos": NeuralType(('B', 'T'), ChannelType()), - "labels_mask": NeuralType(('B', 'T'), MaskType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "spans": NeuralType(('B', 'T', 'C'), IntType()), - } - - def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None: - self.example_builder = example_builder - self.examples = self.example_builder.read_input_file(input_file, infer=False) - self.pad_token_id = self.example_builder._pad_id - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - example = self.examples[idx] - input_ids = np.array(example.features["input_ids"], dtype=np.int16) - input_mask = np.array(example.features["input_mask"], dtype=np.int8) - segment_ids = np.array(example.features["segment_ids"], dtype=np.int8) - input_ids_for_subwords = np.array(example.features["input_ids_for_subwords"], dtype=np.int16) - input_mask_for_subwords = np.array(example.features["input_mask_for_subwords"], dtype=np.int8) - segment_ids_for_subwords = np.array(example.features["segment_ids_for_subwords"], dtype=np.int8) - character_pos_to_subword_pos = np.array(example.features["character_pos_to_subword_pos"], dtype=np.int16) - labels_mask = np.array(example.features["labels_mask"], dtype=np.int8) - labels = np.array(example.features["labels"], dtype=np.int8) - spans = np.array(example.features["spans"], dtype=np.int16) - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans). - """ - return collate_train_dataset(batch, pad_token_id=self.pad_token_id) - - -class TarredSpellcheckingAsrCustomizationDataset(IterableDataset): - """ - This Dataset loads training examples from tarred tokenized pickle files. - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - Additionally, please note that the len() of this DataLayer is assumed to be the number of tokens - of the text data. Shard strategy is scatter - each node gets a unique set of shards, which are permanently - pre-allocated and never changed at runtime. 
- Args: - text_tar_filepaths: a string (can be brace-expandable). - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 1. - pad_token_id: id of pad token (used in collate_fn) - """ - - def __init__( - self, - text_tar_filepaths: str, - shuffle_n: int = 1, - global_rank: int = 0, - world_size: int = 1, - pad_token_id: int = -1, # use real value or get error - ): - super(TarredSpellcheckingAsrCustomizationDataset, self).__init__() - if pad_token_id < 0: - raise ValueError("use non-negative pad_token_id: " + str(pad_token_id)) - - self.pad_token_id = pad_token_id - - # Replace '(', '[', '<' and '_OP_' with '{' - brace_keys_open = ['(', '[', '<', '_OP_'] - for bkey in brace_keys_open: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "{") - - # Replace ')', ']', '>' and '_CL_' with '}' - brace_keys_close = [')', ']', '>', '_CL_'] - for bkey in brace_keys_close: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "}") - - # Brace expand - text_tar_filepaths = list(braceexpand.braceexpand(text_tar_filepaths)) - - logging.info("Tarred dataset shards will be scattered evenly across all nodes.") - if len(text_tar_filepaths) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(text_tar_filepaths)}) is not divisible " - f"by number of distributed workers ({world_size}). " - f"Some shards will not be used ({len(text_tar_filepaths) % world_size})." - ) - begin_idx = (len(text_tar_filepaths) // world_size) * global_rank - end_idx = begin_idx + (len(text_tar_filepaths) // world_size) - logging.info('Begin Index : %d' % (begin_idx)) - logging.info('End Index : %d' % (end_idx)) - text_tar_filepaths = text_tar_filepaths[begin_idx:end_idx] - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - - self.tarpath = text_tar_filepaths - - # Put together WebDataset - self._dataset = wds.DataPipeline( - wds.SimpleShardList(urls=text_tar_filepaths), - webdataset_split_by_workers, - wds.shuffle(shuffle_n), - wds.tarfile_to_samples(), - wds.rename(pkl='pkl', key='__key__'), - wds.to_tuple('pkl', 'key'), - wds.map(self._build_sample), - ) - - def _build_sample(self, fname): - # Load file - pkl_file, _ = fname - pkl_file = BytesIO(pkl_file) - data = pickle.load(pkl_file) - pkl_file.close() - input_ids = data["input_ids"] - input_mask = data["input_mask"] - segment_ids = data["segment_ids"] - input_ids_for_subwords = data["input_ids_for_subwords"] - input_mask_for_subwords = data["input_mask_for_subwords"] - segment_ids_for_subwords = data["segment_ids_for_subwords"] - character_pos_to_subword_pos = data["character_pos_to_subword_pos"] - labels_mask = data["labels_mask"] - labels = data["labels"] - spans = data["spans"] - - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) - - def __iter__(self): - return self._dataset.__iter__() - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, 
character_pos_to_subword_pos, labels_mask, labels, spans). - """ - return collate_train_dataset(batch, pad_token_id=self.pad_token_id) - - -class SpellcheckingAsrCustomizationTestDataset(Dataset): - """ - Dataset for inference pipeline. - - Args: - sents: list of strings - example_builder: instance of BertExampleBuilder - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "input_mask": NeuralType(('B', 'T'), MaskType()), - "segment_ids": NeuralType(('B', 'T'), ChannelType()), - "input_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "input_mask_for_subwords": NeuralType(('B', 'T'), MaskType()), - "segment_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()), - "character_pos_to_subword_pos": NeuralType(('B', 'T'), ChannelType()), - "fragment_indices": NeuralType(('B', 'T', 'C'), IntType()), - } - - def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None: - self.example_builder = example_builder - self.examples, self.hyps_refs = self.example_builder.read_input_file(input_file, infer=True) - self.pad_token_id = self.example_builder._pad_id - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - example = self.examples[idx] - input_ids = np.array(example.features["input_ids"]) - input_mask = np.array(example.features["input_mask"]) - segment_ids = np.array(example.features["segment_ids"]) - input_ids_for_subwords = np.array(example.features["input_ids_for_subwords"]) - input_mask_for_subwords = np.array(example.features["input_mask_for_subwords"]) - segment_ids_for_subwords = np.array(example.features["segment_ids_for_subwords"]) - character_pos_to_subword_pos = np.array(example.features["character_pos_to_subword_pos"], dtype=np.int64) - fragment_indices = np.array(example.features["fragment_indices"], dtype=np.int16) - return ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - fragment_indices, - ) - - def _collate_fn(self, batch): - """collate batch of items - Args: - batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos). - """ - return collate_test_dataset(batch, pad_token_id=self.pad_token_id) diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py b/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py deleted file mode 100644 index 7385f19b414a..000000000000 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py +++ /dev/null @@ -1,929 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
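Both `collate_train_dataset` and `collate_test_dataset` above follow the same pattern: find the longest example per feature, right-pad ids with `pad_token_id` and masks/labels with zeros using `np.pad`, then stack the padded arrays into `LongTensor`s. The sketch below condenses that pattern to just two features; the toy batch and field names are assumptions for illustration, not the deleted code.

# Condensed sketch of the padding/stacking pattern used by the collate functions above.
import numpy as np
import torch

def pad_and_stack(batch, pad_token_id):
    """batch: list of (input_ids, input_mask) numpy arrays of varying length."""
    max_len = max(len(ids) for ids, _ in batch)
    padded_ids, padded_mask = [], []
    for ids, mask in batch:
        pad = max_len - len(ids)
        padded_ids.append(np.pad(ids, (0, pad), constant_values=pad_token_id))
        padded_mask.append(np.pad(mask, (0, pad), constant_values=0))
    return (
        torch.LongTensor(np.array(padded_ids)),
        torch.LongTensor(np.array(padded_mask)),
    )

batch = [(np.array([101, 7, 8, 102]), np.ones(4, dtype=np.int8)),
         (np.array([101, 9, 102]), np.ones(3, dtype=np.int8))]
ids, mask = pad_and_stack(batch, pad_token_id=0)
print(ids.shape, mask.shape)  # torch.Size([2, 4]) torch.Size([2, 4])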
- - -import json -import math -import random -import re -from collections import defaultdict, namedtuple -from typing import Dict, List, Set, Tuple, Union - -import numpy as np -from numba import jit - -"""Utility functions for Spellchecking ASR Customization.""" - - -def replace_diacritics(text): - text = re.sub(r"[éèëēêęěė]", "e", text) # latin - text = re.sub(r"[ё]", "е", text) # cyrillic - text = re.sub(r"[ãâāáäăàąåạảǎ]", "a", text) - text = re.sub(r"[úūüùưûů]", "u", text) - text = re.sub(r"[ôōóöõòőø]", "o", text) - text = re.sub(r"[ćçč]", "c", text) - text = re.sub(r"[ïīíîıì]", "i", text) - text = re.sub(r"[ñńňņ]", "n", text) - text = re.sub(r"[țťţ]", "t", text) - text = re.sub(r"[łľļ]", "l", text) - text = re.sub(r"[żžź]", "z", text) - text = re.sub(r"[ğ]", "g", text) - text = re.sub(r"[ďđ]", "d", text) - text = re.sub(r"[ķ]", "k", text) - text = re.sub(r"[ř]", "r", text) - text = re.sub(r"[ý]", "y", text) - text = re.sub(r"[æ]", "ae", text) - text = re.sub(r"[œ]", "oe", text) - text = re.sub(r"[șşšś]", "s", text) - return text - - -def load_ngram_mappings(input_name: str, max_misspelled_freq: int = 1000000000) -> Tuple[defaultdict, Set]: - """Loads n-gram mapping vocabularies in form required by dynamic programming - Args: - input_name: file with n-gram mappings - max_misspelled_freq: threshold on misspelled n-gram frequency - Returns: - vocab: dict {key=original_ngram, value=dict{key=misspelled_ngram, value=frequency}} - ban_ngram: set of banned misspelled n-grams - - Input format: - u t o u+i t o 49 8145 114 - u t o t e 63 8145 16970 - u t o o+_ t o 42 8145 1807 - """ - vocab = defaultdict(dict) - ban_ngram = set() - - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - orig, misspelled, joint_freq, orig_freq, misspelled_freq = line.strip().split("\t") - if orig == "" or misspelled == "": - raise ValueError("Empty n-gram: orig=" + orig + "; misspelled=" + misspelled) - misspelled = misspelled.replace("", "=") - if misspelled.replace("=", "").strip() == "": # skip if resulting ngram doesn't contain any real character - continue - if int(misspelled_freq) > max_misspelled_freq: - ban_ngram.add(misspelled + " ") # space at the end is required within get_index function - vocab[orig][misspelled] = int(joint_freq) / int(orig_freq) - return vocab, ban_ngram - - -def load_ngram_mappings_for_dp(input_name: str) -> Tuple[defaultdict, defaultdict, defaultdict, int]: - """Loads n-gram mapping vocabularies in form required by dynamic programming - Args: - input_name: file with n-gram mappings - Returns: - joint_vocab: dict where key=(original_ngram, misspelled_ngram), value=frequency - orig_vocab: dict where key=original_ngram, value=frequency - misspelled_vocab: dict where key=misspelled_ngram, value=frequency - max_len: maximum n-gram length seen in vocabulary - - Input format: original \t misspelled \t joint_freq \t original_freq \t misspelled_freq - u t o u+i t o 49 8145 114 - u t o t e 63 8145 16970 - u t o o+_ t o 42 8145 1807 - """ - joint_vocab = defaultdict(int) - orig_vocab = defaultdict(int) - misspelled_vocab = defaultdict(int) - max_len = 0 - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - orig, misspelled, joint_freq, _, _ = line.strip().split("\t") - if orig == "" or misspelled == "": - raise ValueError("Emty n-gram: orig=" + orig + "; misspelled=" + misspelled) - misspelled = misspelled.replace("", " ").replace("+", " ") - misspelled = " ".join(misspelled.split()) - if misspelled == "": # skip if resulting ngram doesn't contain any real 
character - continue - max_len = max(max_len, orig.count(" ") + 1, misspelled.count(" ") + 1) - joint_vocab[(orig, misspelled)] += int(joint_freq) - orig_vocab[orig] += int(joint_freq) - misspelled_vocab[misspelled] += int(joint_freq) - return joint_vocab, orig_vocab, misspelled_vocab, max_len - - -def get_alignment_by_dp( - ref_phrase: str, hyp_phrase: str, dp_data: Tuple[defaultdict, defaultdict, defaultdict, int] -) -> List[Tuple[str, str, float, float, int, int, int]]: - """Get best alignment path between a reference and (possibly) misspelled phrase using n-gram mappings vocabulary. - Args: - ref_phrase: candidate reference phrase (letters separated by space, real space replaced by underscore) - hyp_phrase: (possibly) misspelled phrase (letters separated by space, real space replaced by underscore) - dp_data: n-gram mapping vocabularies used by dynamic programming - Returns: - list of tuples (hyp_ngram, ref_ngram, logprob, sum_logprob, joint_freq, orig_freq, misspelled_freq) - This is best alignment path. - - Example: - ref_phrase: "a n h y d r i d e" - hyp_phrase: "a n d _ h y d r o d" - - Result: - [("*", "*", 0.0, 0.0, 0, 0, 0) - ("a n d _ h", "a n h", -2.34, -2.34, 226, 2338, 2203) - ("y d r o", "y d r i", -2.95, -5.29, 11, 211, 1584) - ("d", "d e", -1.99, -7.28, 60610, 444714, 2450334) - ] - Final path score is in path[-1][3]: -7.28 - Note that the order of ref_phrase and hyp_phrase matters, because n-gram mappings vocabulary is not symmetrical. - """ - joint_vocab, orig_vocab, misspelled_vocab, max_len = dp_data - hyp_letters = ["*"] + hyp_phrase.split() - ref_letters = ["*"] + ref_phrase.split() - DpInfo = namedtuple( - "DpInfo", ["hyp_pos", "ref_pos", "best_hyp_ngram_len", "best_ref_ngram_len", "score", "sum_score"] - ) - history = defaultdict(DpInfo) - history[(0, 0)] = DpInfo( - hyp_pos=0, ref_pos=0, best_hyp_ngram_len=1, best_ref_ngram_len=1, score=0.0, sum_score=0.0 - ) - for hyp_pos in range(len(hyp_letters)): - for ref_pos in range(len(ref_letters)): - if hyp_pos == 0 and ref_pos == 0: # cell (0, 0) is already defined - continue - # consider cell (hyp_pos, ref_pos) and find best path to get there - best_hyp_ngram_len = 0 - best_ref_ngram_len = 0 - best_ngram_score = float("-inf") - best_sum_score = float("-inf") - # loop over paths ending on non-empty ngram mapping - for hyp_ngram_len in range(1, 1 + min(max_len, hyp_pos + 1)): - hyp_ngram = " ".join(hyp_letters[(hyp_pos - hyp_ngram_len + 1) : (hyp_pos + 1)]) - for ref_ngram_len in range(1, 1 + min(max_len, ref_pos + 1)): - ref_ngram = " ".join(ref_letters[(ref_pos - ref_ngram_len + 1) : (ref_pos + 1)]) - if (ref_ngram, hyp_ngram) not in joint_vocab: - continue - joint_freq = joint_vocab[(ref_ngram, hyp_ngram)] - orig_freq = orig_vocab.get(ref_ngram, 1) - ngram_score = math.log(joint_freq / orig_freq) - previous_cell = (hyp_pos - hyp_ngram_len, ref_pos - ref_ngram_len) - if previous_cell not in history: - print("cell ", previous_cell, "does not exist") - continue - previous_score = history[previous_cell].sum_score - sum_score = ngram_score + previous_score - if sum_score > best_sum_score: - best_sum_score = sum_score - best_ngram_score = ngram_score - best_hyp_ngram_len = hyp_ngram_len - best_ref_ngram_len = ref_ngram_len - # loop over two variants with deletion of one character - deletion_score = -6.0 - insertion_score = -6.0 - if hyp_pos > 0: - previous_cell = (hyp_pos - 1, ref_pos) - previous_score = history[previous_cell].sum_score - sum_score = deletion_score + previous_score - if sum_score > best_sum_score: - 
best_sum_score = sum_score - best_ngram_score = deletion_score - best_hyp_ngram_len = 1 - best_ref_ngram_len = 0 - - if ref_pos > 0: - previous_cell = (hyp_pos, ref_pos - 1) - previous_score = history[previous_cell].sum_score - sum_score = insertion_score + previous_score - if sum_score > best_sum_score: - best_sum_score = sum_score - best_ngram_score = insertion_score - best_hyp_ngram_len = 0 - best_ref_ngram_len = 1 - - if best_hyp_ngram_len == 0 and best_ref_ngram_len == 0: - raise ValueError("best_hyp_ngram_len = 0 and best_ref_ngram_len = 0") - - # save cell to history - history[(hyp_pos, ref_pos)] = DpInfo( - hyp_pos=hyp_pos, - ref_pos=ref_pos, - best_hyp_ngram_len=best_hyp_ngram_len, - best_ref_ngram_len=best_ref_ngram_len, - score=best_ngram_score, - sum_score=best_sum_score, - ) - # now trace back on best path starting from last positions - path = [] - hyp_pos = len(hyp_letters) - 1 - ref_pos = len(ref_letters) - 1 - cell_info = history[(hyp_pos, ref_pos)] - path.append(cell_info) - while hyp_pos > 0 or ref_pos > 0: - hyp_pos -= cell_info.best_hyp_ngram_len - ref_pos -= cell_info.best_ref_ngram_len - cell_info = history[(hyp_pos, ref_pos)] - path.append(cell_info) - - result = [] - for info in reversed(path): - hyp_ngram = " ".join(hyp_letters[(info.hyp_pos - info.best_hyp_ngram_len + 1) : (info.hyp_pos + 1)]) - ref_ngram = " ".join(ref_letters[(info.ref_pos - info.best_ref_ngram_len + 1) : (info.ref_pos + 1)]) - joint_freq = joint_vocab.get((ref_ngram, hyp_ngram), 0) - orig_freq = orig_vocab.get(ref_ngram, 0) - misspelled_freq = misspelled_vocab.get(hyp_ngram, 0) - result.append((hyp_ngram, ref_ngram, info.score, info.sum_score, joint_freq, orig_freq, misspelled_freq)) - return result - - -def get_index( - custom_phrases: List[str], - vocab: defaultdict, - ban_ngram_global: Set[str], - min_log_prob: float = -4.0, - max_phrases_per_ngram: int = 100, -) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]: - """Given a restricted vocabulary of replacements, - loops through custom phrases, - generates all possible conversions and creates index. - - Args: - custom_phrases: list of all custom phrases, characters should be split by space, real space replaced to underscore. - vocab: n-gram mappings vocabulary - dict {key=original_ngram, value=dict{key=misspelled_ngram, value=frequency}} - ban_ngram_global: set of banned misspelled n-grams - min_log_prob: minimum log probability, after which we stop growing this n-gram. - max_phrases_per_ngram: maximum phrases that we allow to store per one n-gram. N-grams exceeding that quantity get banned. - - Returns: - phrases - list of phrases. Position in this list is used as phrase_id. - ngram2phrases - resulting index, i.e. 
dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - """ - - ban_ngram_local = set() # these ngrams are banned only for given custom_phrases - ngram_to_phrase_and_position = defaultdict(list) - - for custom_phrase in custom_phrases: - inputs = custom_phrase.split(" ") - begin = 0 - index_keys = [{} for _ in inputs] # key - letter ngram, index - beginning positions in phrase - - for begin in range(len(inputs)): - for end in range(begin + 1, min(len(inputs) + 1, begin + 5)): - inp = " ".join(inputs[begin:end]) - if inp not in vocab: - continue - for rep in vocab[inp]: - lp = math.log(vocab[inp][rep]) - - for b in range(max(0, end - 5), end): # try to grow previous ngrams with new replacement - new_ngrams = {} - for ngram in index_keys[b]: - lp_prev = index_keys[b][ngram] - if len(ngram) + len(rep) <= 10 and b + ngram.count(" ") == begin: - if lp_prev + lp > min_log_prob: - new_ngrams[ngram + rep + " "] = lp_prev + lp - index_keys[b].update(new_ngrams) # join two dictionaries - # add current replacement as ngram - if lp > min_log_prob: - index_keys[begin][rep + " "] = lp - - for b in range(len(index_keys)): - for ngram, lp in sorted(index_keys[b].items(), key=lambda item: item[1], reverse=True): - if ngram in ban_ngram_global: # here ngram ends with a space - continue - real_length = ngram.count(" ") - ngram = ngram.replace("+", " ").replace("=", " ") - ngram = " ".join(ngram.split()) # here ngram doesn't end with a space anymore - if ngram + " " in ban_ngram_global: # this can happen after deletion of + and = - continue - if ngram in ban_ngram_local: - continue - ngram_to_phrase_and_position[ngram].append((custom_phrase, b, real_length, lp)) - if len(ngram_to_phrase_and_position[ngram]) > max_phrases_per_ngram: - ban_ngram_local.add(ngram) - del ngram_to_phrase_and_position[ngram] - continue - - phrases = [] # id to phrase - phrase2id = {} # phrase to id - ngram2phrases = defaultdict(list) # ngram to list of tuples (phrase_id, begin, length, logprob) - - for ngram in ngram_to_phrase_and_position: - for phrase, b, length, lp in ngram_to_phrase_and_position[ngram]: - if phrase not in phrase2id: - phrases.append(phrase) - phrase2id[phrase] = len(phrases) - 1 - ngram2phrases[ngram].append((phrase2id[phrase], b, length, lp)) - - return phrases, ngram2phrases - - -def load_index(input_name: str) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]: - """ Load index from file - Args: - input_name: file with index - Returns: - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. 
- ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - """ - phrases = [] # id to phrase - phrase2id = {} # phrase to id - ngram2phrases = defaultdict(list) # ngram to list of tuples (phrase_id, begin_pos, size, logprob) - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - ngram, phrase, b, size, lp = line.split("\t") - b = int(b) - size = int(size) - lp = float(lp) - if phrase not in phrase2id: - phrases.append(phrase) - phrase2id[phrase] = len(phrases) - 1 - ngram2phrases[ngram].append((phrase2id[phrase], b, size, lp)) - return phrases, ngram2phrases - - -def search_in_index( - ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]], phrases: List[str], letters: Union[str, List[str]] -) -> Tuple[np.ndarray, List[Set[str]]]: - """ Function used to search in index - - Args: - ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores. - - Returns: - phrases2positions: a matrix of size (len(phrases), len(letters)). - It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere. - It is used later to find phrases with many hits within a contiguous window - potential matching candidates. - position2ngrams: positions in ASR-hypothesis mapped to sets of ngrams starting from that position. - It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered). - """ - - if " " in letters: - raise ValueError("letters should not contain space: " + str(letters)) - - phrases2positions = np.zeros((len(phrases), len(letters)), dtype=float) - # positions mapped to sets of ngrams starting from that position - position2ngrams = [set() for _ in range(len(letters))] - - begin = 0 - for begin in range(len(letters)): - for end in range(begin + 1, min(len(letters) + 1, begin + 7)): - ngram = " ".join(letters[begin:end]) - if ngram not in ngram2phrases: - continue - for phrase_id, b, size, lp in ngram2phrases[ngram]: - phrases2positions[phrase_id, begin:end] = 1.0 - position2ngrams[begin].add(ngram) - return phrases2positions, position2ngrams - - -@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit -def get_all_candidates_coverage(phrases, phrases2positions): - """Get maximum hit coverage for each phrase - within a moving window of length of the phrase. - Args: - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - phrases2positions: a matrix of size (len(phrases), len(ASR-hypothesis)). - It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere. - Returns: - candidate2coverage: list of size len(phrases) containing coverage (0.0 to 1.0) in best window. - candidate2position: list of size len(phrases) containing starting position of best window. 
- """ - candidate2coverage = [0.0] * len(phrases) - candidate2position = [-1] * len(phrases) - - for i in range(len(phrases)): - phrase_length = phrases[i].count(" ") + 1 - all_coverage = np.sum(phrases2positions[i]) / phrase_length - # if total coverage on whole ASR-hypothesis is too small, there is no sense in using moving window - if all_coverage < 0.4: - continue - moving_sum = np.sum(phrases2positions[i, 0:phrase_length]) - max_sum = moving_sum - best_pos = 0 - for pos in range(1, phrases2positions.shape[1] - phrase_length + 1): - moving_sum -= phrases2positions[i, pos - 1] - moving_sum += phrases2positions[i, pos + phrase_length - 1] - if moving_sum > max_sum: - max_sum = moving_sum - best_pos = pos - - coverage = max_sum / (phrase_length + 2) # smoothing - candidate2coverage[i] = coverage - candidate2position[i] = best_pos - return candidate2coverage, candidate2position - - -def get_candidates( - ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]], - phrases: List[str], - letters: Union[str, List[str]], - pool_for_random_candidates: List[str], - min_phrase_coverage: float = 0.8, -) -> List[Tuple[str, int, int, float, float]]: - """Given an index of custom vocabulary and an ASR-hypothesis retrieve 10 candidates. - Args: - ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob) - phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id. - letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores. - pool_for_random_candidates: large list of strings, from which to sample random candidates in case when there are less than 10 real candidates - min_phrase_coverage: We discard candidates which are not covered by n-grams to at least to this extent - (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered). - Returns: - candidates: list of tuples (candidate_text, approximate_begin_position, length, coverage of window in ASR-hypothesis, coverage of phrase itself). 
- """ - phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, letters) - candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions) - - # mask for each custom phrase, how many which symbols are covered by input ngrams - phrases2coveredsymbols = [[0 for x in phrases[i].split(" ")] for i in range(len(phrases))] - candidates = [] - k = 0 - for idx, coverage in sorted(enumerate(candidate2coverage), key=lambda item: item[1], reverse=True): - begin = candidate2position[idx] # this is most likely beginning of this candidate - phrase_length = phrases[idx].count(" ") + 1 - for pos in range(begin, begin + phrase_length): - # we do not know exact end of custom phrase in text, it can be different from phrase length - if pos >= len(position2ngrams): - break - for ngram in position2ngrams[pos]: - for phrase_id, b, size, lp in ngram2phrases[ngram]: - if phrase_id != idx: - continue - for ppos in range(b, b + size): - if ppos >= phrase_length: - break - phrases2coveredsymbols[phrase_id][ppos] = 1 - k += 1 - if k > 100: - break - real_coverage = sum(phrases2coveredsymbols[idx]) / len(phrases2coveredsymbols[idx]) - if real_coverage < min_phrase_coverage: - continue - candidates.append((phrases[idx], begin, phrase_length, coverage, real_coverage)) - - # no need to process this sentence further if it does not contain any real candidates - if len(candidates) == 0: - print("WARNING: no real candidates", candidates) - return [] - - while len(candidates) < 10: - dummy = random.choice(pool_for_random_candidates) - dummy = " ".join(list(dummy.replace(" ", "_"))) - candidates.append((dummy, -1, dummy.count(" ") + 1, 0.0, 0.0)) - - candidates = candidates[:10] - random.shuffle(candidates) - if len(candidates) != 10: - print("WARNING: cannot get 10 candidates", candidates) - return [] - - return candidates - - -def read_spellmapper_predictions(filename: str) -> List[Tuple[str, List[Tuple[int, int, str, float]], List[int]]]: - """Read results of SpellMapper inference from file. 
- Args: - filename: file with SpellMapper results - Returns: - list of tuples (sent, list of fragment predictions, list of letter predictions) - One fragment prediction is a tuple (begin, end, replacement_text, prob) - """ - results = [] - with open(filename, "r", encoding="utf-8") as f: - for line in f: - text, candidate_str, fragment_predictions_str, letter_predictions_str = line.strip().split("\t") - text = text.replace(" ", "").replace("_", " ") - candidate_str = candidate_str.replace(" ", "").replace("_", " ") - candidates = candidate_str.split(";") - letter_predictions = list(map(int, letter_predictions_str.split())) - if len(candidates) != 10: - raise IndexError("expect 10 candidates, got: ", len(candidates)) - if len(text) != len(letter_predictions): - raise IndexError("len(text)=", len(text), "; len(letter_predictions)=", len(letter_predictions)) - replacements = [] - if fragment_predictions_str != "": - for prediction in fragment_predictions_str.split(";"): - begin, end, candidate_id, prob = prediction.split(" ") - begin = int(begin) - end = int(end) - candidate_id = int(candidate_id) - prob = float(prob) - replacements.append((begin, end, candidates[candidate_id - 1], prob)) - replacements.sort() # it will sort by begin, then by end - results.append((text, replacements, letter_predictions)) - return results - - -def substitute_replacements_in_text( - text: str, replacements: List[Tuple[int, int, str, float]], replace_hyphen_to_space: bool -) -> str: - """Substitute replacements to the input text, iterating from end to beginning, so that indexing does not change. - Note that we expect intersecting replacements to be already filtered. - Args: - text: sentence; - replacements: list of replacements, each is a tuple (begin, end, text, probability); - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - Returns: - corrected sentence - """ - replacements.sort() - last_begin = len(text) + 1 - corrected_text = text - for begin, end, candidate, prob in reversed(replacements): - if end > last_begin: - print("WARNING: skip intersecting replacement [", candidate, "] in text: ", text) - continue - if replace_hyphen_to_space: - candidate = candidate.replace("-", " ") - corrected_text = corrected_text[:begin] + candidate + corrected_text[end:] - last_begin = begin - return corrected_text - - -def apply_replacements_to_text( - text: str, - replacements: List[Tuple[int, int, str, float]], - min_prob: float = 0.5, - replace_hyphen_to_space: bool = False, - dp_data: Tuple[defaultdict, defaultdict, defaultdict, int] = None, - min_dp_score_per_symbol: float = -99.9, -) -> str: - """Filter and apply replacements to the input sentence. 
- Args: - text: input sentence; - replacements: list of proposed replacements (probably intersecting), each is a tuple (begin, end, text, probability); - min_prob: threshold on replacement probability; - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - dp_data: n-gram mapping vocabularies used by dynamic programming, if None - dynamic programming is not used; - min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length - Returns: - corrected sentence - """ - # sort replacements by positions - replacements.sort() - # filter replacements - # Note that we do not skip replacements with same text, otherwise intersecting candidates with lower probability can win - filtered_replacements = [] - for j in range(len(replacements)): - replacement = replacements[j] - begin, end, candidate, prob = replacement - fragment = text[begin:end] - candidate_spaced = " ".join(list(candidate.replace(" ", "_"))) - fragment_spaced = " ".join(list(fragment.replace(" ", "_"))) - # apply penalty if candidate length is bigger than fragment length - # to avoid cases like "forward-looking" replacing "looking" in "forward looking" resulting in "forward forward looking" - if len(candidate) > len(fragment): - penalty = len(fragment) / len(candidate) - prob *= penalty - # skip replacement with low probability - if prob < min_prob: - continue - # skip replacements with some predefined templates, e.g. "*'s" => "*s" - if check_banned_replacements(fragment, candidate): - continue - if dp_data is not None: - path = get_alignment_by_dp(candidate_spaced, fragment_spaced, dp_data) - # path[-1][3] is the sum of logprobs for best path of dynamic programming: divide sum_score by length - if path[-1][3] / (len(fragment)) < min_dp_score_per_symbol: - continue - - # skip replacement if it intersects with previous replacement and has lower probability, otherwise remove previous replacement - if len(filtered_replacements) > 0 and filtered_replacements[-1][1] > begin: - if filtered_replacements[-1][3] > prob: - continue - else: - filtered_replacements.pop() - filtered_replacements.append((begin, end, candidate, prob)) - - return substitute_replacements_in_text(text, filtered_replacements, replace_hyphen_to_space) - - -def update_manifest_with_spellmapper_corrections( - input_manifest_name: str, - short2full_name: str, - output_manifest_name: str, - spellmapper_results_name: str, - min_prob: float = 0.5, - replace_hyphen_to_space: bool = True, - field_name: str = "pred_text", - use_dp: bool = True, - ngram_mappings: Union[str, None] = None, - min_dp_score_per_symbol: float = -1.5, -) -> None: - """Post-process SpellMapper predictions and write corrected sentence to the specified field of nemo manifest. - The previous content of this field will be copied to "*_before_correction" field. - If the sentence was split into fragments before running SpellMapper, all replacements will be first gathered together and then applied to the original long sentence. 
- Args: - input_manifest_name: input nemo manifest; - short2full_name: text file with two columns: short_sent \t full_sent; - output_manifest_name: output nemo manifest; - spellmapper_results_name: text file with SpellMapper inference results; - min_prob: threshold on replacement probability; - replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces; - field_name: name of json field whose text we want to correct; - use_dp: bool = If True, additional replacement filtering will be applied using dynamic programming (works slow); - ngram_mappings: file with n-gram mappings, only needed if use_dp=True - min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length - """ - short2full_sent = defaultdict(list) - sent2corrections = defaultdict(dict) - with open(short2full_name, "r", encoding="utf-8") as f: - for line in f: - s = line.strip() - short_sent, full_sent = s.split("\t") - short2full_sent[short_sent].append(full_sent) - sent2corrections[full_sent] = [] - - spellmapper_results = read_spellmapper_predictions(spellmapper_results_name) - dp_data = None - if use_dp: - dp_data = load_ngram_mappings_for_dp(ngram_mappings) - - for text, replacements, _ in spellmapper_results: - short_sent = text - if short_sent not in short2full_sent: - continue - # it can happen that one short sentence occurred in multiple full sentences - for full_sent in short2full_sent[short_sent]: - offset = full_sent.find(short_sent) - for begin, end, candidate, prob in replacements: - sent2corrections[full_sent].append((begin + offset, end + offset, candidate, prob)) - - out = open(output_manifest_name, "w", encoding="utf-8") - with open(input_manifest_name, "r", encoding="utf-8") as f: - for line in f: - record = json.loads(line.strip()) - sent = record[field_name] - record[field_name + "_before_correction"] = record[field_name] - if sent in sent2corrections: - record[field_name] = apply_replacements_to_text( - sent, - sent2corrections[sent], - min_prob=min_prob, - replace_hyphen_to_space=replace_hyphen_to_space, - dp_data=dp_data, - min_dp_score_per_symbol=min_dp_score_per_symbol, - ) - out.write(json.dumps(record) + "\n") - out.close() - - -def extract_and_split_text_from_manifest( - input_name: str, output_name: str, field_name: str = "pred_text", len_in_words: int = 16, step_in_words: int = 8 -) -> None: - """Extract text of the specified field in nemo manifest and split it into fragments (possibly with intersection). - The result is saved to a text file with two columns: short_sent \t full_sent. - This is useful if we want to process shorter sentences and then apply the results to the original long sentence. - Args: - input_name: input nemo manifest, - output_name: output text file, - field_name: name of json field from which we extract the sentence text, - len_in_words: maximum number of words in a fragment, - step_in_words: on how many words we move at each step. - For example, if the len_in_words=16 and step_in_words=8 the fragments will be intersected by half. 
- """ - short2full_sent = set() - with open(input_name, "r", encoding="utf-8") as f: - for line in f: - record = json.loads(line.strip()) - sent = record[field_name] - if " " in sent: - raise ValueError("found multiple space in: " + sent) - words = sent.split() - for i in range(0, len(words), step_in_words): - short_sent = " ".join(words[i : i + len_in_words]) - short2full_sent.add((short_sent, sent)) - - with open(output_name, "w", encoding="utf-8") as out: - for short_sent, full_sent in short2full_sent: - out.write(short_sent + "\t" + full_sent + "\n") - - -def check_banned_replacements(src: str, dst: str) -> bool: - """This function is used to check is a pair of words/phrases is matching some common template that we don't want to replace with one another. - Args: - src: first phrase - dst: second phrase - Returns True if this replacement should be banned. - """ - # customers' => customer's - if src.endswith("s'") and dst.endswith("'s") and src[0:-2] == dst[0:-2]: - return True - # customer's => customers' - if src.endswith("'s") and dst.endswith("s'") and src[0:-2] == dst[0:-2]: - return True - # customers => customer's - if src.endswith("s") and dst.endswith("'s") and src[0:-1] == dst[0:-2]: - return True - # customer's => customers - if src.endswith("'s") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # customers => customers' - if src.endswith("s") and dst.endswith("s'") and src[0:-1] == dst[0:-2]: - return True - # customers' => customers - if src.endswith("s'") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # utilities => utility's - if src.endswith("ies") and dst.endswith("y's") and src[0:-3] == dst[0:-3]: - return True - # utility's => utilities - if src.endswith("y's") and dst.endswith("ies") and src[0:-3] == dst[0:-3]: - return True - # utilities => utility - if src.endswith("ies") and dst.endswith("y") and src[0:-3] == dst[0:-1]: - return True - # utility => utilities - if src.endswith("y") and dst.endswith("ies") and src[0:-1] == dst[0:-3]: - return True - # group is => group's - if src.endswith(" is") and dst.endswith("'s") and src[0:-3] == dst[0:-2]: - return True - # group's => group is - if src.endswith("'s") and dst.endswith(" is") and src[0:-2] == dst[0:-3]: - return True - # trex's => trex - if src.endswith("'s") and src[0:-2] == dst: - return True - # trex => trex's - if dst.endswith("'s") and dst[0:-2] == src: - return True - # increases => increase (but trimass => trimas is ok) - if src.endswith("s") and (not src.endswith("ss")) and src[0:-1] == dst: - return True - # increase => increases ((but trimas => trimass is ok)) - if dst.endswith("s") and (not dst.endswith("ss")) and dst[0:-1] == src: - return True - # anticipate => anticipated - if src.endswith("e") and dst.endswith("ed") and src[0:-1] == dst[0:-2]: - return True - # anticipated => anticipate - if src.endswith("ed") and dst.endswith("e") and src[0:-2] == dst[0:-1]: - return True - # blocks => blocked - if src.endswith("s") and dst.endswith("ed") and src[0:-1] == dst[0:-2]: - return True - # blocked => blocks - if src.endswith("ed") and dst.endswith("s") and src[0:-2] == dst[0:-1]: - return True - # lives => lived - if src.endswith("es") and dst.endswith("ed") and src[0:-2] == dst[0:-2]: - return True - # lived => lives - if src.endswith("ed") and dst.endswith("es") and src[0:-2] == dst[0:-2]: - return True - # regarded => regard - if src.endswith("ed") and src[0:-2] == dst: - return True - # regard => regarded - if dst.endswith("ed") and dst[0:-2] == src: - return True 
- # regardeding => regard - if src.endswith("ing") and src[0:-3] == dst: - return True - # regard => regarding - if dst.endswith("ing") and dst[0:-3] == src: - return True - # longer => long - if src.endswith("er") and src[0:-2] == dst: - return True - # long => longer - if dst.endswith("er") and dst[0:-2] == src: - return True - # discussed => discussing - if src.endswith("ed") and dst.endswith("ing") and src[0:-2] == dst[0:-3]: - return True - # discussing => discussed - if src.endswith("ing") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: - return True - # live => living - if src.endswith("e") and dst.endswith("ing") and src[0:-1] == dst[0:-3]: - return True - # living => live - if src.endswith("ing") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # discussion => discussing - if src.endswith("ion") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: - return True - # discussing => discussion - if src.endswith("ing") and dst.endswith("ion") and src[0:-3] == dst[0:-3]: - return True - # alignment => aligning - if src.endswith("ment") and dst.endswith("ing") and src[0:-4] == dst[0:-3]: - return True - # aligning => alignment - if src.endswith("ing") and dst.endswith("ment") and src[0:-3] == dst[0:-4]: - return True - # dispensers => dispensing - if src.endswith("ers") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: - return True - # dispensing => dispensers - if src.endswith("ing") and dst.endswith("ers") and src[0:-3] == dst[0:-3]: - return True - # integrate => integrity - if src.endswith("ate") and dst.endswith("ity") and src[0:-3] == dst[0:-3]: - return True - # integrity => integrate - if src.endswith("ity") and dst.endswith("ate") and src[0:-3] == dst[0:-3]: - return True - # discussion => discussed - if src.endswith("ion") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: - return True - # discussed => discussion - if src.endswith("ed") and dst.endswith("ion") and src[0:-2] == dst[0:-3]: - return True - # anticipation => anticipate - if src.endswith("ion") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # anticipate => anticipation - if src.endswith("e") and dst.endswith("ion") and src[0:-1] == dst[0:-3]: - return True - # incremental => increment - if src.endswith("ntal") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: - return True - # increment => incremental - if src.endswith("nt") and dst.endswith("ntal") and src[0:-2] == dst[0:-4]: - return True - # national => nation - if src.endswith("nal") and dst.endswith("n") and src[0:-3] == dst[0:-1]: - return True - # nation => national - if src.endswith("n") and dst.endswith("nal") and src[0:-1] == dst[0:-3]: - return True - # significantly => significant - if src.endswith("ntly") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: - return True - # significant => significantly - if src.endswith("nt") and dst.endswith("ntly") and src[0:-2] == dst[0:-4]: - return True - # delivery => deliverer - if src.endswith("ery") and dst.endswith("erer") and src[0:-3] == dst[0:-4]: - return True - # deliverer => delivery - if src.endswith("erer") and dst.endswith("ery") and src[0:-4] == dst[0:-3]: - return True - # deliver => deliverer - if src.endswith("er") and dst.endswith("erer") and src[0:-2] == dst[0:-4]: - return True - # deliverer => deliver - if src.endswith("erer") and dst.endswith("er") and src[0:-4] == dst[0:-2]: - return True - # comparably => comparable - if src.endswith("bly") and dst.endswith("ble") and src[0:-3] == dst[0:-3]: - return True - # comparable => comparably - if 
src.endswith("ble") and dst.endswith("bly") and src[0:-3] == dst[0:-3]: - return True - # comparably => comparability - if src.endswith("bly") and dst.endswith("bility") and src[0:-3] == dst[0:-6]: - return True - # comparability => comparably - if src.endswith("bility") and dst.endswith("bly") and src[0:-6] == dst[0:-3]: - return True - # beautiful => beautifully - if src.endswith("l") and dst.endswith("lly") and src[0:-1] == dst[0:-3]: - return True - # beautifully => beautiful - if src.endswith("lly") and dst.endswith("l") and src[0:-3] == dst[0:-1]: - return True - # active => actively - if src.endswith("e") and dst.endswith("ely") and src[0:-1] == dst[0:-3]: - return True - # actively => active - if src.endswith("ely") and dst.endswith("e") and src[0:-3] == dst[0:-1]: - return True - # america => american - if src.endswith("a") and dst.endswith("an") and src[0:-1] == dst[0:-2]: - return True - # american => america - if src.endswith("an") and dst.endswith("a") and src[0:-2] == dst[0:-1]: - return True - # reinvesting => investing - if src.startswith("re") and src[2:] == dst: - return True - # investing => reinvesting - if dst.startswith("re") and dst[2:] == src: - return True - # unchanged => changed - if src.startswith("un") and src[2:] == dst: - return True - # changed => unchanged - if dst.startswith("un") and dst[2:] == src: - return True - # disrespected => respected - if src.startswith("dis") and src[3:] == dst: - return True - # respected => disrespected - if dst.startswith("dis") and dst[3:] == src: - return True - # outperformance => performance - if src.startswith("out") and src[3:] == dst: - return True - # performance => outperformance - if dst.startswith("out") and dst[3:] == src: - return True - return False diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py deleted file mode 100644 index d82ee36a8833..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py +++ /dev/null @@ -1,2000 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -__all__ = [ - 'BertPunctuationCapitalizationDataset', - 'LABEL_ID_DIR_FOR_NEMO_CHECKPOINT', - 'Progress', - 'PunctuationCapitalizationEvalDataConfig', - 'PunctuationCapitalizationTrainDataConfig', - 'create_label_ids', - 'create_masks_and_segment_ids', - 'is_legacy_data_config', - 'legacy_data_config_to_new_data_config', - 'load_label_ids', - 'raise_not_equal_labels_error', - 'save_label_ids', -] - -import itertools -import multiprocessing as mp -import os -import pickle -import tempfile -from dataclasses import dataclass -from math import ceil -from pathlib import Path -from queue import Empty -from time import sleep -from typing import Any, Dict, List, Optional, Set, Tuple, Union - -import numpy as np -import torch -from numpy import ndarray -from omegaconf import MISSING, DictConfig, OmegaConf -from torch.nn.utils.rnn import pad_sequence -from tqdm import tqdm - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import get_label_stats, get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import AudioSignal, ChannelType, LabelsType, LengthsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -try: - from nemo.collections.asr.parts.preprocessing import AudioSegment - - ASR_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False - - -MAX_NUM_QUERIES_IN_SPLIT = 10 ** 4 -TOKENIZATION_PROGRESS_REPORT_PERIOD = 10 ** 3 -BATCH_MARK_UP_PROGRESS_REPORT_PERIOD = 10 ** 4 -BATCH_BUILDING_PROGRESS_REPORT_PERIOD = 10 ** 4 - -LABEL_ID_DIR_FOR_NEMO_CHECKPOINT = "label_id_files_for_nemo_checkpoint" - - -@dataclass -class PunctuationCapitalizationDataConfigBase: - """A base class for punctuation and capitalization data configs. This class does not define ``ds_item`` - attribute which works differently for train and evaluation data.""" - - ################################################### - # AUDIO DATASET PARAMETERS - ################################################### - use_audio: bool = False - """ - Whether to use audio or not. If set to True you should provide ``audio_file``. - """ - - audio_file: Optional[str] = None - """ - Path to the file with audio paths one per row. - """ - - sample_rate: Optional[int] = 16000 - """ - Sample rate of audios to use. - """ - - use_bucketing: Optional[bool] = True - """ - Whether to pack samples into ``tokens_in_batch`` or not. Increases GPU utilization but may cause significant RAM consumption if used together with ``use_audio``. - """ - - batch_size: Optional[int] = 32 - """ - Batch size used if ``use_bucketing`` set to False. - """ - - preload_audios: Optional[bool] = True - """ - If set to True audios will be loaded during ``__init__`` call of dataset. Otherwise it will be loaded during ``collate_fn ``call - """ - - ################################################### - # PARAMETERS COMMON FOR REGULAR AND TARRED DATASETS - ################################################### - use_tarred_dataset: bool = MISSING - """Whether to use tarred dataset. If True, then you should provide ``tar_metadata_file``. Otherwise, you should - provide ``text_file``, ``labels_file``, ``tokens_in_batch``.""" - - label_info_save_dir: Optional[str] = None - """A path to a directory where files created during dataset processing are stored. These files include label id - files and label stats files. By default, it is a directory containing ``text_file`` or ``tar_metadata_file``. 
- You may need this parameter if dataset directory is read-only and thus does not allow saving anything near dataset - files""" - - ################################################# - # REGULAR DATASET PARAMETERS - ################################################# - text_file: Optional[str] = None - """A path to a file with source text data without punctuation and capitalization.""" - - labels_file: Optional[str] = None - """A path to a file with punctuation and capitalization labels in NeMo format. NeMo format is described in - `documentation - `_ - """ - - tokens_in_batch: Optional[int] = None - """Number of tokens in a batch including paddings and special tokens ([CLS], [SEP], [UNK]). This config does - not have ``batch_size`` parameter.""" - - max_seq_length: int = 512 - """Max number of tokens in a source sequence. ``max_seq_length`` includes [CLS] and [SEP] tokens. Sequences - which are too long will be clipped by removal of tokens from the end of a sequence.""" - - num_samples: int = -1 - """A number of samples loaded from ``text_file`` and ``labels_file`` which are used in the dataset. If this - parameter equals ``-1``, then all samples are used.""" - - use_cache: bool = True - """Whether to use pickled features. If pickled features file does not exist or ``use_cache=False``, then features - are pickled in ``cache_dir``. Pickled features include input ids, subtokens mask (mask of first tokens in words), - encoded punctuation and capitalization labels, label ids. Features creation consumes considerable time and this - ``use_cache=True`` significantly speeds up training starting. Pickled features are also used for sharing features - between processes if data parallel training is used.""" - - cache_dir: Optional[str] = None - """A path to a directory containing cache or directory where newly created cache is saved. By default, it is - a directory containing ``text_file``. You may need this parameter if cache for a dataset is going to be created - and the dataset directory is read-only. - - ``cache_dir`` and ``label_info_save_dir`` are separate parameters for the case when a cache is ready and this cache - is stored in a read only directory. In this case you will separate ``label_info_save_dir``.""" - - get_label_frequences: bool = False - """Whether to show and save label frequencies. Frequencies are showed if ``verbose`` parameter is ``True``. If - ``get_label_frequencies=True``, then frequencies are saved into ``label_info_save_dir``""" - - verbose: bool = True - """If ``True`` dataset instance will print progress messages and examples of acquired features.""" - - n_jobs: Optional[int] = 0 - """Number of workers used for features creation (tokenization, label encoding, and clipping). If 0, then - multiprocessing is not used; if ``None``, then n_jobs is equal to the number of CPU cores. - There can be weird deadlocking errors with some tokenizers (e.g. SentencePiece) if ``n_jobs`` is greater than zero. - """ - - ################################################# - # TARRED DATASET PARAMETERS - ################################################# - tar_metadata_file: Optional[str] = None - """A path to tarred dataset metadata file. Tarred metadata file and other parts of tarred dataset are usually - created by the script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_ - """ - - tar_shuffle_n: int = 1 - """The size of shuffle buffer of `webdataset`. 
The number of batches which are permuted.""" - - shard_strategy: Optional[str] = 'scatter' - """Tarred dataset shard distribution strategy chosen as a str value during ddp. Accepted values are `scatter` and `replicate`. - `scatter`: The default shard strategy applied by WebDataset, where each node gets a unique set of shards, which are permanently - pre-allocated and never changed at runtime. `replicate` is an optional shard strategy, where each node gets the entire set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. The benefit of replication is that - it allows each node to sample data points from the entire dataset independently of other nodes, and reduces dependence on value of - ``tar_shuffle_n``. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tar files, and therefore more than one node may sample - the same tarfile, and even sample the same data points! As such, there is no assured guarantee that all samples in the dataset - will be sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific occasions (when the number of - shards is not divisible with ``world_size``), will not sample the entire dataset. For these reasons it is not advisable to use - tarred datasets as validation or test datasets. - """ - - ################################################# - # PYTORCH DATALOADER PARAMETERS - ################################################# - shuffle: bool = True - """Shuffle batches every epoch. For regular training datasets, the parameter also activates batch repacking every - epoch. For tarred dataset, it would be only batches permutation.""" - - drop_last: bool = False - """In cases when data parallelism is used, ``drop_last`` defines the way data pipeline behaves when some replicas - are out of data and some are not. If ``drop_last`` is ``True``, then epoch ends in the moment when any replica runs - out of data. If ``drop_last`` is ``False``, then the replica will replace missing batch with a batch from a pool of - batches that the replica has already processed. If data parallelism is not used, then parameter ``drop_last`` does - not do anything. For more information see ``torch.utils.data.distributed.DistributedSampler``""" - - pin_memory: bool = True - """See ``torch.utils.data.DataLoader`` documentation.""" - - num_workers: int = 8 - """See ``torch.utils.data.DataLoader`` documentation.""" - - persistent_workers: bool = True - """See ``torch.utils.data.DataLoader`` documentation.""" - - -@dataclass -class PunctuationCapitalizationTrainDataConfig(PunctuationCapitalizationDataConfigBase): - ds_item: Optional[str] = MISSING - """Path to a directory where `tar_metadata_file` or `text_file` and `labels_file` lay.""" - - -@dataclass -class PunctuationCapitalizationEvalDataConfig(PunctuationCapitalizationDataConfigBase): - ds_item: Optional[Any] = MISSING - """Path to a directory where `tar_metadata_file` or `text_file` and `labels_file` lay. ``Any`` = ``str`` or - ``List[str]``. If a ``List[str]``, then the model is tested or validated on several datasets.""" - - -def is_legacy_data_config(ds_section: DictConfig) -> bool: - return 'use_tarred_dataset' not in ds_section - - -def legacy_data_config_to_new_data_config( - ds_section: DictConfig, legacy_dataset_section: DictConfig, train: bool -) -> DictConfig: - """ - Transform old style dataset to new format dataset. 
- Args: - ds_section: a ds section (``train_ds``, or ``validation_ds``, or ``test_ds``) from old style config. Such - section contain ``batch_size`` parameter. - legacy_dataset_section: a ``model.dataset`` section. ``model.dataset`` section contains ``data_dir`` parameter - train: ``True`` if ``train_ds`` is transformed and ``False`` otherwise - - Returns: - New format dataset based on either ``PunctuationCapitalizationTrainDataConfig`` (``train=True``) or - ``PunctuationCapitalizationEvalDataConfig`` (``train=False``) - """ - if train: - cls = PunctuationCapitalizationTrainDataConfig - ds_item = legacy_dataset_section.get('data_dir') - else: - cls = PunctuationCapitalizationEvalDataConfig - ds_item = ds_section.get('ds_item') - ds_item = legacy_dataset_section.get('data_dir') if ds_item is None else ds_item - if ds_item is None: - raise ValueError( - f"Data directory was not found in legacy config.\nspecific dataset configuration:\n" - f"{OmegaConf.to_yaml(ds_section)}\nmodel.dataset:\n{OmegaConf.to_yaml(legacy_dataset_section)}" - ) - new_config = OmegaConf.structured( - cls( - use_tarred_dataset=False, - text_file=ds_section.text_file, - labels_file=ds_section.labels_file, - ds_item=ds_item, - max_seq_length=legacy_dataset_section.get( - 'max_seq_length', PunctuationCapitalizationDataConfigBase.max_seq_length - ), - ) - ) - return new_config - - -def _check_number_of_labels( - words: List[str], - query: str, - qi: int, - split_i: int, - punctuation_labels: List[str], - capitalization_labels: List[str], -) -> None: - if len(words) != len(punctuation_labels): - raise ValueError( - f"Number of punctuation labels for a query number {qi} in a split number {split_i} is not equal to " - f"number of words. Number of words: {len(words)}, number of punctuation labels: " - f"{len(punctuation_labels)}. First 100 characters of the query: '{query[:100]}', punctuation labels: " - f"'{punctuation_labels}'" - ) - if len(words) != len(capitalization_labels): - raise ValueError( - f"Number of capitalization labels for a query number {qi} in a split number {split_i} is not equal to " - f"number of words. Number of words: {len(words)}, number of capitalization labels: " - f"{len(capitalization_labels)}. First 100 characters of the query: '{query[:100]}', " - f"capitalization labels: '{capitalization_labels}'" - ) - - -def _show_prog(queues: Tuple[mp.Queue, ...], totals: List[int], descriptions: List[str], units: List[str]) -> None: - """ - Show several ``tqdm`` progress bars. - Args: - queues: a list of queues by which progress is delivered into this function. Each queue is responsible for one - progress bar. ``show_prog`` function extracts integers from ``queues`` elements and adds them to progress - bars. If value extracted from a queue equals ``-1``, then corresponding progress bar is closed. When all - progress bars are closed, this function returns. - totals: list of values 100% of progress bars. See more in a description of ``total`` parameter of - ``tqdm.tqdm`` function - descriptions: list of descriptions of progress bars. See more in a description of ``desc`` parameter of - ``tqdm.tqdm`` function - units: list of progress bar units. See more in a description of ``unit`` parameter of ``tqdm.tqdm`` function - """ - if not all([len(queues) == len(v) for v in [totals, descriptions, units]]): - raise ValueError( - f"All of parameters `queues`, `total_num_lines`, `descriptions`, `units` have to have equal lengths. 
" - f"len(queues)={len(queues)}, len(total_num_lines)={len(totals)}, " - f"len(descriptions)={len(descriptions)}, len(units)={len(units)}." - ) - prog = [ - tqdm(total=tt, desc=dd, unit=uu, unit_scale=True, position=i) - for i, (tt, dd, uu) in enumerate(zip(totals, descriptions, units)) - ] - finished = [False] * len(queues) - while True: - for i, queue in enumerate(queues): - stop = False - to_add = 0 - try: - v = queue.get(block=False) - while v != -1: - to_add += v - v = queue.get(block=False) - stop = True - except Empty: - if to_add == 0 and not stop: - continue - prog[i].n += to_add - prog[i].update(0) - if prog[i].n >= totals[i]: - finished[i] = True - prog[i].close() - if stop: - if prog[i].n < totals[i]: - logging.warning( - f"Progress with description '{descriptions[i]}' terminated before progress bar " - f"reached 100%. prog.n={prog[i].n}, total_num_lines={totals[i]}" - ) - finished[i] = True - prog[i].close() - if all(finished): - break - sleep(0.1) - - -class Progress: - """ - Manages several ``tqdm`` progress bars for multiprocess tasks. This class can be used as context manager. - - The class starts separate process which creates and updates progress bars. Information to progress process is - passed via multiprocessing queues. There is a separate queue for every progress bar. - - You can use it as context manager: - - .. code-block:: python - with Progress([10, 20], ["progress bar 1", "progress bar 2"], ["parrot", "frog"]) as progress_queues: - num_processes = 10 - with multiprocessing.Pool(num_processes) as pool: - data = list(zip(my_data, [progress_queues[0]] * num_processes, [progress_queues[1]] * num_processes)) - pool.starmap(worker_func, data) - - Or without context manager: - - .. code-block:: python - progress = Progress([10, 20], ["progress bar 1", "progress bar 2"], ["parrot", "frog"]) - progress_queues = progress.get_queue() - num_processes = 10 - with multiprocessing.Pool(num_processes) as pool: - data = list(zip(my_data, [progress_queues[0]] * num_processes, [progress_queues[1]] * num_processes)) - pool.starmap(worker_func, data) - progress.finish() - - In a worker function you will have to put number of processed items into the progress queues. For example: - - .. code-block:: python - def worker_func(my_datum, parrot_progress_queue, frog_progress_queue): - ... - for i in range(10): - parrot_progress_queue.put(1) - frog_progress_queue.put(2) - - Progress bars and progress process are closed when ``finish`` or ``__exit__`` methods are called. - """ - - def __init__(self, total: Union[int, List[int]], desc: Union[str, List[str]], unit: Union[str, List[str]]) -> None: - """ - Starts progress process and creates queues for passing information to the progress process. Number of progress - bars is equal to the max length of lists ``total``, ``desc``, ``unit``. If none of these parameters is a list, - then 1 progress bar is created. - - Args: - total: a list of ``int`` which length is equal to the number of progress bars OR an ``int`` OR a list of - one ``int``. Number which comprises 100% of progress bar. When sum of values passed through the - corresponding queue equals ``total`` corresponding progress bar reaches 100%. If ``total`` is an - ``int`` or a list of one element, then all progress bars have equal ``total`` parameter. - desc: a list of ``str`` which length is equal to the number of progress bars OR a ``str`` OR a list of one - ``str``. Description of a progress bar which is showed as a prefix. 
See more in description of - parameter ``desc`` of function ``tqdm.tqdm``. - unit: a list of ``str`` which length is equal to the number of progress bars OR a ``str`` OR a list of one - ``str``. A unit of a progress bar. See more in description of parameter ``unit`` of function - ``tqdm.tqdm``. - """ - if not isinstance(total, list): - total = [total] - if not isinstance(desc, list): - desc = [desc] - if not isinstance(unit, list): - unit = [unit] - num_processes = max([len(total), len(desc), len(unit)]) - for param in [total, desc, unit]: - if len(param) not in [num_processes, 1]: - raise ValueError( - f"If parameter of `Progress.__init__` method is a list, then it has to be the same length as other " - f"parameters which are lists" - ) - if len(param) == 1: - param *= num_processes - manager = mp.Manager() - self.progress_queues = tuple(manager.Queue() for _ in range(num_processes)) - self.progress_process = mp.Process(target=_show_prog, args=(self.progress_queues, total, desc, unit)) - self.progress_process.start() - - def __enter__(self) -> Tuple[mp.Queue, ...]: - return self.get_queues() - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.finish() - - def get_queues(self) -> Tuple[mp.Queue, ...]: - return self.progress_queues - - def finish(self) -> None: - for q in self.progress_queues: - q.put(-1) - self.progress_process.join() - - -class TokenizeCreateMasksClipWorker: - """A worker for tokenization, encoding labels, creating masks for first token in a word, sequence clipping""" - - def __init__( - self, - max_seq_length: int, - tokenizer: TokenizerSpec, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - pad_label: str, - verbose: bool, - progress_queue: mp.Queue, - ) -> None: - """ - Args: - max_seq_length: max number of tokens in an input sequence including [CLS] and [SEP] tokens. If number of - tokens in a sequence exceeds ``max_seq_length``, then excess tokens in the end of the sequence - are removed - tokenizer: a tokenizer instance which has properties ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id`` - punct_label_ids: dict to map punctuation labels to label ids. Starts with pad_label->0. - capit_label_ids: dict to map capitalization labels to label ids. Starts with pad_label->0. - pad_label: pad value use for labels. By default, it's the neutral label for punctuation and capitalization. - Its id in ``punct_label_ids`` and ``capit_label_ids`` has to be ``0`` - verbose: whether to report when the worker finishes its job - progress_queue: a multiprocessing queue used for reporting progress. 
Useful for creating tarred dataset - """ - self.max_seq_length = max_seq_length - self.tokenizer = tokenizer - self.punct_label_ids = punct_label_ids - self.capit_label_ids = capit_label_ids - self.pad_label = pad_label - self.verbose = verbose - self.progress_queue = progress_queue - - def _maybe_clip(self, values: List[int], append_value: int) -> List[int]: - if len(values) > self.max_seq_length: - return values[: self.max_seq_length - 1] + [append_value] - return values - - def __call__( - self, - queries: List[str], - punct_label_lines: Optional[Union[List[str], Tuple[str, ...]]], - capit_label_lines: Optional[Union[List[str], Tuple[str, ...]]], - split_i: int, - audio_queries: Optional[List[str]] = None, - sample_rate: Optional[int] = None, - preload_audios: Optional[bool] = True, - ) -> Tuple[ - List[ndarray], - List[ndarray], - List[ndarray], - List[ndarray], - Union[List[Any], List[None]], - Union[List[Any], List[None]], - Union[List[Any], List[None]], - ]: - """ - Tokenize, clip, encode labels, and create masks of first tokens in words. - - Args: - queries: text sequences - punct_label_lines: a list or a tuple of labels for every word in a sequence (str) - capit_label_lines: a list of a tuple labels for every word in a sequence (str) - split_i: number of a split which is processed. Used for logging - audio_queries: a list of audio filepaths - sample_rate: target sample rate of audios - preload_audios: whether to preload audios or not - - Returns: - input_ids: a list of 1D int32 arrays. Each array contains token ids of the corresponding query - subtokens_mask: a list of 1D boolean arrays. An array element is ``True`` if corresponding token is the - first token in a word - punct_labels: a list of 1D int32 arrays. Encoded punctuation labels for every token in a query. Tokens in - one word have identical labels - capit_labels: a list of 1D int32 arrays. Encoded capitalization labels for every token in a query. 
Tokens - in one word have identical labels - """ - all_input_ids, all_subtokens_mask, punct_all_labels, capit_all_labels = [], [], [], [] - dummy = [None] * len(queries) # Needed to avoid code duplication with different values of `self.use_audio` - all_audio_waveforms = [] if preload_audios else dummy - audio_lengths = [] if preload_audios else dummy - audio_filepaths = [] if not preload_audios else dummy - progress_made = 0 - queries = zip(queries, audio_queries) if audio_queries else zip(queries, dummy) - for i, (query, audio_query) in enumerate(queries): - words = query.split() - input_ids, subtokens_mask = [self.tokenizer.cls_id], [0] - _check_number_of_labels(words, query, i, split_i, punct_label_lines[i], capit_label_lines[i]) - pad_id = self.punct_label_ids[self.pad_label] - punct_labels = [pad_id] - punct_query_labels = [self.punct_label_ids[lab] for lab in punct_label_lines[i]] - capit_labels = [pad_id] - capit_query_labels = [self.capit_label_ids[lab] for lab in capit_label_lines[i]] - for j, word in enumerate(words): - word_ids = self.tokenizer.text_to_ids(word) - if not word_ids and len(word): - word_ids = [self.tokenizer.unk_id] - input_ids.extend(word_ids) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_ids) - 1)) - - punct_labels.extend([punct_query_labels[j]] * len(word_ids)) - capit_labels.extend([capit_query_labels[j]] * len(word_ids)) - - # add eos token - input_ids.append(self.tokenizer.sep_id) - subtokens_mask.append(0) - - all_input_ids.append(np.array(self._maybe_clip(input_ids, self.tokenizer.sep_id), dtype=np.int32)) - all_subtokens_mask.append(np.array(self._maybe_clip(subtokens_mask, 0), dtype=bool)) - - punct_labels.append(pad_id) - punct_all_labels.append(np.array(self._maybe_clip(punct_labels, pad_id), dtype=np.int32)) - capit_labels.append(pad_id) - capit_all_labels.append(np.array(self._maybe_clip(capit_labels, pad_id), dtype=np.int32)) - if preload_audios and audio_query: - if ASR_AVAILABLE: - segment = AudioSegment.from_file(audio_query.strip(), target_sr=sample_rate) - all_audio_waveforms.append(segment.samples) - audio_lengths.append(segment.num_samples) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - - elif audio_query: - audio_filepaths.append(audio_query.strip()) - - progress_made += 1 - if progress_made >= TOKENIZATION_PROGRESS_REPORT_PERIOD: - self.progress_queue.put(progress_made) - progress_made = 0 - - self.progress_queue.put(progress_made) - if self.verbose: - logging.info(f"Finished processing data split number {split_i}") - - return ( - all_input_ids, - all_subtokens_mask, - punct_all_labels, - capit_all_labels, - all_audio_waveforms, - audio_lengths, - audio_filepaths, - ) - - -def _get_features( - queries: Union[List[str], Tuple[str, ...]], - punct_label_lines: Union[List[str], Tuple[str, ...]], - capit_label_lines: Union[List[str], Tuple[str, ...]], - max_seq_length: int, - tokenizer: TokenizerSpec, - punct_label_ids: Dict[str, int] = None, - capit_label_ids: Dict[str, int] = None, - pad_label: str = 'O', - verbose: bool = True, - n_jobs: Optional[int] = 0, - progress_queue: Optional[mp.Queue] = None, - audio_queries: Optional[List[str]] = None, - sample_rate: Optional[int] = None, - preload_audios: Optional[bool] = True, -) -> Tuple[List[Any], List[Any], List[Any], List[Any], List[Any], List[Any], List[Any]]: - """ - Tokenizes data, encodes labels, creates masks of first tokens in words, clips 
sequences by number of tokens. - - Args: - queries: text sequences - max_seq_length: max number of tokens in an input sequence including [CLS] and [SEP] tokens. If number of tokens - in a sequence exceeds ``max_seq_length``, then excess tokens in the end of the sequence are removed - tokenizer: a tokenizer instance which has properties ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id`` - punct_label_ids: dict to map punctuation labels to label ids. Starts with pad_label->0. - capit_label_ids: dict to map capitalization labels to label ids. Starts with pad_label->0. - pad_label: pad value use for labels. By default, it's the neutral label for punctuation and capitalization. - Its id in ``punct_label_ids`` and ``capit_label_ids`` has to be ``0`` - punct_label_lines: a list of a tuple of labels for every word in a sequence (str) - capit_label_lines: a list or a tuple of labels for every word in a sequence (str) - verbose: whether to show examples of tokenized data and various progress information - n_jobs: a number of workers used for preparing features. If ``n_jobs <= 0``, then do not use multiprocessing - and run features creation in this process. If not set, number of workers will be equal to the number of - CPUs. - - !!WARNING!! - There can be deadlocking problems with some tokenizers (e.g. SentencePiece, HuggingFace AlBERT) - if ``n_jobs > 0``. - - progress_queue: a multiprocessing queue used for reporting progress. Useful for creating tarred dataset - audio_queries: a list of audio filepaths - sample_rate: target sample rate of audios - preload_audios: whether to preload audios or not - - Returns: - input_ids: a list of 1D int32 arrays. Each array contains token ids of corresponding query - subtokens_mask: a list of 1D boolean arrays. An array element is ``True`` if corresponding token is the - first token in a word - punct_labels: a list of 1D int32 arrays. Encoded punctuation labels for every token in a query. Tokens in one - word have identical labels. - capit_labels: a list of 1D int32 arrays. Encoded capitalization labels for every token in a query. 
Tokens in - one word have identical labels - """ - if verbose: - logging.info("Start initial tokenization.") - create_progress_process = progress_queue is None - if n_jobs is None: - n_jobs = min(mp.cpu_count(), len(queries)) - - if verbose: - logging.info(f"Running tokenization with {n_jobs} jobs.") - - # Number of queries in split - split_size = min(len(queries) // max(n_jobs, 1), MAX_NUM_QUERIES_IN_SPLIT) - n_split = len(queries) // split_size - split_queries = [queries[split_size * i : split_size * (i + 1)] for i in range(n_split - 1)] + [ - queries[split_size * (n_split - 1) :] - ] - split_punct_labels_lines = [ - punct_label_lines[split_size * i : split_size * (i + 1)] for i in range(n_split - 1) - ] + [punct_label_lines[split_size * (n_split - 1) :]] - split_capit_labels_lines = [ - capit_label_lines[split_size * i : split_size * (i + 1)] for i in range(n_split - 1) - ] + [capit_label_lines[split_size * (n_split - 1) :]] - - args = list(zip(split_queries, split_punct_labels_lines, split_capit_labels_lines, range(n_split))) - if audio_queries: - split_audio_queries = [audio_queries[split_size * i : split_size * (i + 1)] for i in range(n_split - 1)] + [ - audio_queries[split_size * (n_split - 1) :] - ] - - args = list( - zip( - split_queries, - split_punct_labels_lines, - split_capit_labels_lines, - range(n_split), - split_audio_queries, - [sample_rate for _ in range(n_split)], - [preload_audios for _ in range(n_split)], - ) - ) - if create_progress_process: - progress = Progress(len(queries), "Tokenization", "query") - progress_queue = progress.get_queues()[0] - if n_jobs > 0: - with mp.Pool(n_jobs) as pool: - result = pool.starmap( - TokenizeCreateMasksClipWorker( - max_seq_length, tokenizer, punct_label_ids, capit_label_ids, pad_label, verbose, progress_queue, - ), - args, - ) - else: - result = [] - for x in args: - result.append( - TokenizeCreateMasksClipWorker( - max_seq_length, tokenizer, punct_label_ids, capit_label_ids, pad_label, verbose, progress_queue, - )(*x) - ) - if create_progress_process: - progress.finish() - - input_ids, subtokens_mask, punct_labels, capit_labels, waveforms, audio_lengths, audio_filepaths = tuple( - list(itertools.chain(*e)) for e in zip(*result) - ) - if verbose: - logging.info("Finished initial tokenization.") - get_stats([len(inp) for inp in input_ids]) - logging.info(f"Finished clipping and padding.") - for i in range(min(len(input_ids), 5)): - logging.info("*** Example ***") - logging.info("i: %s" % i) - logging.info("subtokens: %s" % " ".join(list(map(str, input_ids[i])))) - logging.info("subtokens_mask: %s" % " ".join(list(map(str, subtokens_mask[i])))) - logging.info("punct_labels: %s" % " ".join(list(map(str, punct_labels[i])))) - logging.info("capit_labels: %s" % " ".join(list(map(str, capit_labels[i])))) - - return ( - input_ids, - subtokens_mask, - waveforms, - audio_lengths, - audio_filepaths, - punct_labels, - capit_labels, - ) - - -def create_masks_and_segment_ids( - input_ids: np.ndarray, - subtokens_mask: np.ndarray, - pad_id: int, - cls_id: int, - sep_id: int, - ignore_start_end: bool, - ignore_extra_tokens: bool, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Creates segment ids array, input mask, loss mask. - - Segment ids array is BERT token type ids in HuggingFace terminology. It is a zeros array for punctuation - and capitalization task. - - Input mask element is ``True`` if an element of ``input_ids`` is not padding and ``False`` otherwise. - - Loss mask element is ``True`` for the first token in a word. 
If ``ignore_start_end=False``, then loss mask
- element is ``True`` for [CLS] and [SEP] tokens. If ``ignore_extra_tokens=False``, then loss mask element is ``True``
- for all word tokens. In all other cases loss mask elements are ``False``.
-
- Args:
- input_ids: an integer array of shape ``[Batch, Time]`` containing ids of source tokens
- subtokens_mask: a boolean array of shape ``[Batch, Time]`` whose elements are ``True`` if they correspond to
- the first token of some word
- pad_id: an id of padding token
- cls_id: an id of [CLS] token
- sep_id: an id of [SEP] token
- ignore_start_end: whether to compute loss for [CLS] and [SEP] tokens
- ignore_extra_tokens: whether to compute loss for tokens which are not first tokens in words
-
- Returns:
- segment_ids: int8 array of shape [Batch, Time]
- input_mask: boolean array of shape [Batch, Time]
- loss_mask: boolean array of shape [Batch, Time]
- """
- segment_ids = np.zeros_like(input_ids, dtype=np.int8)
- input_mask = np.not_equal(input_ids, pad_id)
- # a token is special if it is either [CLS] or [SEP]
- special_mask = np.equal(input_ids, cls_id) | np.equal(input_ids, sep_id)
- if ignore_start_end:
- if ignore_extra_tokens:
- loss_mask = subtokens_mask
- else:
- loss_mask = input_mask & ~special_mask
- else:
- if ignore_extra_tokens:
- loss_mask = subtokens_mask | special_mask
- else:
- loss_mask = input_mask
- return segment_ids, input_mask, loss_mask
-
-
-def create_label_ids(unique_labels: Set[str], pad_label: str) -> Dict[str, int]:
- """
- Returns label ids dictionary. ``pad_label`` always has id ``0``. Other labels are sorted in alphabetical order.
- Args:
- unique_labels: a set of labels from which label ids dictionary is created. May or may not contain ``pad_label``
- pad_label: label used for padding. It is also a neutral label
-
- Returns:
- label ids dictionary
- """
- label_ids = {pad_label: 0}
- if pad_label in unique_labels:
- unique_labels.remove(pad_label)
- for label in sorted(unique_labels):
- label_ids[label] = len(label_ids)
- return label_ids
-
-
-def load_label_ids(file_path: Union[str, os.PathLike]) -> Dict[str, int]:
- ids = {}
- with open(file_path, encoding='utf_8') as f:
- for i, line in enumerate(f):
- ids[line.strip()] = i
- return ids
-
-
-def save_label_ids(label_ids: Dict[str, int], file_path: Path) -> None:
- """
- Saves label ids map to a file. In each line of a file one label is saved. Labels are saved in the order of
- increasing ids.
-
- Args:
- label_ids: label id dictionary. Pad label has to have id ``0``
- file_path: path to a file where labels will be saved
- """
- file_path.parent.mkdir(parents=True, exist_ok=True)
- with file_path.open('w', encoding='utf_8', newline='\n') as out:
- labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1]))
- out.write('\n'.join(labels))
-
-
-def raise_not_equal_labels_error(
- first_labels: Dict[str, int], second_labels: Dict[str, int], first_labels_desc: str, second_labels_desc: str
-) -> None:
- """
- A helper function for raising a comprehensible error if labels from 2 sources are different.
- Such sources may include: - - labels stored in .nemo checkpoint - - labels stored in tarred dataset - - labels passed in config parameters ``model.common_dataset_parameters.{punct_label_ids,capit_label_ids}`` - - labels from files passed in config parameters ``model.class_labels.{punct_labels_file,capit_labels_file}`` - - labels in attributes ``PunctuationCapitalizationModel.{punct_label_ids,capit_label_ids}`` - - any other source - This function helps to detect configuration early and give error messages that are easy to interpret. - Call this function if ``first_labels != second_labels``. - - Args: - first_labels: first dictionary with labels - second_labels: second dictionary with labels - first_labels_desc: a description of first labels - second_labels_desc: a description of second labels - """ - missing_in_first = {k: second_labels[k] for k in set(second_labels) - set(first_labels)} - missing_in_second = {k: first_labels[k] for k in set(first_labels) - set(second_labels)} - not_equal = { - k: {'FIRST LABELS': first_labels[k], 'SECOND LABELS': second_labels[k]} - for k in set(first_labels) & set(second_labels) - if first_labels[k] != second_labels[k] - } - msg = f"{first_labels_desc} (FIRST LABELS) are not equal to {second_labels_desc} (SECOND LABELS)." - if len(missing_in_first) > 0: - msg += f" Number of SECOND LABELS missing in the FIRST LABELS: {len(missing_in_first)}." - if len(missing_in_second) > 0: - msg += f" Number of FIRST LABELS missing in the SECOND LABELS: {len(missing_in_second)}." - if len(not_equal) > 0: - msg += f" Number of labels which are not equal: {len(not_equal)}." - if len(missing_in_first) > 0: - msg += ( - f" Several examples of missing SECONDS LABELS in the FIRST LABELS: " - f"{dict(list(missing_in_first.items())[:3])}." - ) - if len(missing_in_second) > 0: - msg += ( - f" Several examples of missing FIRST LABELS in the SECOND LABELS: " - f"{dict(list(missing_in_second.items())[:3])}." - ) - if len(not_equal) > 0: - msg += f" Several examples of labels which are not equal: {dict(list(not_equal.items())[:3])}" - raise ValueError(msg) - - -def pad(vectors: List[np.ndarray], length: int, value: Union[int, float, bool]) -> np.ndarray: - """ - Pad vectors to length ``length`` and then stack. - Args: - vectors: a list of 1D arrays. Arrays to pad and stack - length: a length of padded sequence. Has to be greater or equal to the maximum length of an element of - ``vectors``. - value: a value used for padding - - Returns: - an array of padded vectors - """ - result = [] - for v in vectors: - result.append(np.concatenate([v, np.full([length - v.shape[0]], value, dtype=v.dtype)])) - return np.stack(result) - - -class BertPunctuationCapitalizationDataset(Dataset): - """ - A dataset to use during training for punctuation and capitalization tasks. - For inference, you will need - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset.BertPunctuationCapitalizationInferDataset`. - For huge datasets which cannot be loaded into memory simultaneously use - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset`. - - Args: - text_file (:obj:`Union[str, os.PathLike]`): a path to a file with sequences, each line should contain a text - without punctuation and capitalization - labels_file (:obj:`Union[str, os.PathLike]`): a path to a file with labels, each line corresponds to word - labels for a sentence in the ``text_file``. 
Labels have to follow the format described in this section of
- documentation :ref:`NeMo Data Format`.
- max_seq_length (:obj:`int`): max number of tokens in a source sequence. ``max_seq_length`` includes [CLS]
- and [SEP] tokens. Sequences which are too long will be clipped by removal of tokens from the end of the
- sequence.
- tokenizer (:obj:`TokenizerSpec`): a tokenizer instance which has properties ``unk_id``, ``sep_id``, ``bos_id``,
- ``eos_id``.
- num_samples (:obj:`int`, `optional`, defaults to :obj:`-1`): a number of samples you want to use for the
- dataset. If ``-1``, use the whole dataset. Useful for testing.
- tokens_in_batch (:obj:`int`, `optional`, defaults to :obj:`5000`): number of tokens in a batch including
- paddings and special tokens ([CLS], [SEP], [UNK]). The :meth:`__getitem__` method of this class returns not
- samples but ready batches. Number of samples in a batch is adjusted for input sequence lengths. If input
- sequences are short, then a batch will contain more samples. Before packing into batches, samples are
- sorted by number of tokens they contain. Sorting allows reducing the number of pad tokens in a batch
- significantly. Regular PyTorch data loader shuffling will only permute batches without changing their content.
- Proper shuffling is achieved by calling the method :meth:`repack_batches_with_shuffle` every epoch. If
- parameter ``number_of_batches_is_multiple_of`` is greater than 1, some batches may be split into smaller
- pieces.
- pad_label (:obj:`str`, `optional`, defaults to :obj:`'O'`): pad value to use for labels. It's also the neutral
- label both for punctuation and capitalization.
- punct_label_ids (:obj:`Dict[str, int]`, `optional`): dict to map punctuation labels to label ids. For dev set,
- use label ids generated during training to support cases when not all labels are present in the dev set.
- For training, it is recommended to set ``punct_label_ids`` to ``None`` or load from cache.
- capit_label_ids (:obj:`Dict[str, int]`, `optional`): same as ``punct_label_ids`` but for capitalization labels.
- ignore_extra_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to compute loss on
- tokens which are not first tokens in a word. For example, assume that word ``'tokenization'`` is tokenized
- into ``['token', 'ization']``. If ``ignore_extra_tokens=True``, loss mask for the word is
- ``[True, False]``, and if ``ignore_extra_tokens=False``, then loss mask is ``[True, True]``.
- ignore_start_end (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to ignore [CLS] and [SEP] tokens
- in the loss_mask.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to use pickled features already present
- in ``cache_dir`` or not. If pickled features file does not exist or ``use_cache=False``, then features are
- pickled in ``cache_dir``. Pickled features include input ids, subtokens mask (mask of first tokens in
- words), encoded punctuation and capitalization labels, label ids. Feature creation consumes considerable
- time, so ``use_cache=True`` significantly speeds up training startup. Pickled features are also
- used for sharing features between processes if data parallel training is used.
- cache_dir (:obj:`Union[str, os.PathLike]`, `optional`): a path to a directory where cache (pickled features)
- is stored. By default, ``text_file`` parent directory is used. This parameter is useful if dataset
- directory is read-only, and you wish to pickle features.
In such a case, specify a path to a directory which
- allows writing in the ``cache_dir`` parameter.
- get_label_frequencies (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to print and save label
- frequencies. Frequencies are shown if the ``verbose`` parameter is ``True``. If
- ``get_label_frequencies=True``, then frequencies are saved into the ``label_info_save_dir`` directory.
- label_info_save_dir (:obj:`Union[str, os.PathLike]`, `optional`): a path to a directory where label frequencies
- are saved. By default, the ``text_file`` parent directory is used. When the method
- :meth:`save_labels_and_get_file_paths` is called, label ids are saved into the ``label_info_save_dir``
- directory. This parameter is useful if the directory containing ``text_file`` is read-only.
- punct_label_vocab_file (:obj:`Union[str, os.PathLike]`, `optional`): a path to a .csv file containing
- punctuation label vocabulary. Each line in such a vocabulary file contains exactly one label. The first
- line has to contain `pad_label`, otherwise an error will be raised.
- capit_label_vocab_file (:obj:`Union[str, os.PathLike]`, `optional`): same as ``punct_label_vocab_file`` for
- capitalization labels.
- add_masks_and_segment_ids_to_batch (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to add
- ``'loss_mask'``, ``'input_mask'``, ``'segment_ids'`` items to a batch. Useful for creation of tarred
- dataset and can NOT be used during model training and inference.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to show data examples, label stats and
- other useful information.
- n_jobs (:obj:`int`, `optional`, defaults to :obj:`0`): number of workers used for tokenization, encoding
- labels, creating "first token in word" mask, and clipping. If ``n_jobs <= 0``, data preparation is performed
- without multiprocessing. By default, ``n_jobs`` is ``0``.
-
- .. warning::
- There can be deadlocking problems with some tokenizers (e.g. SentencePiece, HuggingFace AlBERT)
- if ``n_jobs > 0``.
-
- number_of_batches_is_multiple_of (:obj:`int`, `optional`, defaults to :obj:`1`): number of batches in the
- dataset is made divisible by ``number_of_batches_is_multiple_of``. If ``number_of_batches_is_multiple_of``
- is greater than 1, then several batches are split in parts until the number of batches
- is divisible by ``number_of_batches_is_multiple_of``. If there are not enough queries in the dataset to
- create enough batches, then a warning is printed. This parameter is useful for dev and validation datasets
- if multiple GPUs are used. The problem is that if the number of batches is not evenly divisible by the number of
- GPUs, then some queries may be processed several times and metrics will be distorted.
- batch_shuffling_random_seed (:obj:`int`, defaults to :obj:`42`): a random seed used for batch repacking and
- shuffling.
- tokenization_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting tokenization
- progress. Useful for creation of tarred dataset
- batch_mark_up_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting progress in
- deciding which samples go into which batches. Useful for creation of tarred dataset
- batch_building_progress_queue (:obj:`multiprocessing.Queue`, `optional`): a queue for reporting progress in
- batch creation (stacking and padding). Useful for creation of tarred dataset
- use_audio (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to ``True``, the dataset will return audio as well as text.
- audio_file (:obj:`Union[str, os.PathLike]`, `optional`): a path to file with audio paths. - sample_rate (:obj:`int`, `optional`, defaults to :obj:`None`): sample rate of audios. Can be used for up sampling or down sampling of audio. - use_bucketing (:obj:`bool`, `optional`, defaults to :obj: `True`): If set to False dataset will return ``batch_size`` batches instead of ``number_of_tokens`` tokens. - preload_audios (:obj:`bool`, `optional`, defaults to :obj: `True`): If set to True batches will include waveforms, if set to False will store audio_filepaths instead and load audios during ``collate_fn`` call - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. """ - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - text_file: Union[str, os.PathLike], - labels_file: Union[str, os.PathLike], - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - tokens_in_batch: int = 5000, - pad_label: str = 'O', - punct_label_ids: Optional[Union[Dict[str, int], DictConfig]] = None, - capit_label_ids: Optional[Union[Dict[str, int], DictConfig]] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = True, - use_cache: bool = True, - cache_dir: Optional[Union[str, os.PathLike]] = None, - get_label_frequencies: bool = False, - label_info_save_dir: Optional[Union[str, os.PathLike]] = None, - punct_label_vocab_file: Optional[Union[str, os.PathLike]] = None, - capit_label_vocab_file: Optional[Union[str, os.PathLike]] = None, - add_masks_and_segment_ids_to_batch: bool = True, - verbose: bool = True, - n_jobs: Optional[int] = 0, - number_of_batches_is_multiple_of: int = 1, - batch_shuffling_random_seed: int = 42, - tokenization_progress_queue: Optional[mp.Queue] = None, - batch_mark_up_progress_queue: Optional[mp.Queue] = None, - batch_building_progress_queue: Optional[mp.Queue] = None, - use_audio: Optional[bool] = False, - audio_file: Optional[Union[str, os.PathLike]] = None, - sample_rate: Optional[int] = None, - use_bucketing: Optional[bool] = True, - preload_audios: Optional[bool] = True, - ) -> None: - """ Initializes BertPunctuationCapitalizationDataset. 
""" - if isinstance(punct_label_ids, DictConfig): - punct_label_ids = OmegaConf.to_container(punct_label_ids) - if isinstance(capit_label_ids, DictConfig): - capit_label_ids = OmegaConf.to_container(capit_label_ids) - - self._check_constructor_parameters( - text_file, - labels_file, - punct_label_ids, - capit_label_ids, - punct_label_vocab_file, - capit_label_vocab_file, - num_samples, - use_cache, - number_of_batches_is_multiple_of, - use_audio, - audio_file, - sample_rate, - ) - - if punct_label_vocab_file is not None: - punct_label_vocab_file = Path(punct_label_vocab_file).expanduser() - punct_label_ids = load_label_ids(punct_label_vocab_file) - if capit_label_vocab_file is not None: - capit_label_vocab_file = Path(capit_label_vocab_file).expanduser() - capit_label_ids = load_label_ids(capit_label_vocab_file) - self.text_file, self.labels_file = Path(text_file).expanduser(), Path(labels_file).expanduser() - if label_info_save_dir is None: - self.label_info_save_dir = self.text_file.parent - else: - self.label_info_save_dir = Path(label_info_save_dir).expanduser() - - self.tokens_in_batch = tokens_in_batch - self.tokenizer = tokenizer - self.pad_label = pad_label - self.ignore_extra_tokens = ignore_extra_tokens - self.ignore_start_end = ignore_start_end - self.add_masks_and_segment_ids_to_batch = add_masks_and_segment_ids_to_batch - self.verbose = verbose - self.batch_mark_up_progress_queue = batch_mark_up_progress_queue - self.batch_building_progress_queue = batch_building_progress_queue - self.use_audio = use_audio - self.audio_file = audio_file - self.sample_rate = sample_rate - self.use_bucketing = use_bucketing - self.preload_audios = preload_audios - - master_device = is_global_rank_zero() - self.features_pkl = self._get_path_to_pkl_features( - self.text_file, self.labels_file, cache_dir, max_seq_length, num_samples - ) - features = None - if master_device and not (self.features_pkl.is_file() and use_cache): - if verbose: - logging.info( - f'Processing {self.text_file}' + f' {self.audio_file if self.audio_file else ""} '.rstrip() - ) - - ( - text_lines, - punct_label_lines, - capit_label_lines, - punct_unique_labels, - capit_unique_labels, - audio_lines, - ) = self._read_dataset(self.text_file, self.labels_file, num_samples, self.audio_file) - - if punct_label_ids: - self._check_label_ids_vs_unique_labels( - punct_label_ids, punct_unique_labels, 'punct', 'punctuation', self.labels_file - ) - else: - punct_label_ids = create_label_ids(punct_unique_labels, self.pad_label) - if capit_label_ids: - self._check_label_ids_vs_unique_labels( - capit_label_ids, capit_unique_labels, 'capit', 'capitalization', self.labels_file - ) - else: - capit_label_ids = create_label_ids(capit_unique_labels, self.pad_label) - features = _get_features( - text_lines, - punct_label_lines, - capit_label_lines, - max_seq_length, - self.tokenizer, - pad_label=self.pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - verbose=self.verbose, - progress_queue=tokenization_progress_queue, - n_jobs=n_jobs, - audio_queries=audio_lines if self.use_audio else None, - sample_rate=self.sample_rate, - preload_audios=self.preload_audios, - ) - self.features_pkl.parent.mkdir(parents=True, exist_ok=True) - - # save features to a temp file first to make sure that non-master processes don't start reading the file - # until the master process is done with writing - ofd, tmp_features_pkl = tempfile.mkstemp( - suffix='.pkl', prefix=os.path.basename(self.features_pkl), 
dir=os.path.dirname(self.features_pkl) - ) - with os.fdopen(ofd, 'wb') as temp_f: - pickle.dump(tuple(list(features) + [punct_label_ids, capit_label_ids]), temp_f) - - os.rename(tmp_features_pkl, self.features_pkl) - - if self.verbose: - logging.info(f'Features saved to {self.features_pkl}') - - # wait until the master process writes to the processed data files - if not master_device: - while features is None and not os.path.exists(self.features_pkl): - sleep(10) - - if features is None: - features = pickle.load(self.features_pkl.open('rb')) - li = features[-2:] - self._check_label_ids_loaded_from_pkl( - punct_label_ids, capit_label_ids, *li, punct_label_vocab_file, capit_label_vocab_file - ) - punct_label_ids, capit_label_ids = li[-2], li[-1] - if tokenization_progress_queue is not None: - tokenization_progress_queue.put(len(features[0])) - if self.verbose: - logging.info(f'Features restored from {self.features_pkl}') - features = features[:-2] - - ( - self.input_ids, - self.subtokens_mask, - self.waveforms, - self.waveforms_length, - self.audio_filepaths, - self.punct_labels, - self.capit_labels, - ) = features - self.punct_label_ids, self.capit_label_ids = punct_label_ids, capit_label_ids - self.number_of_batches_is_multiple_of = number_of_batches_is_multiple_of - self.batch_shuffling_random_state = np.random.RandomState(batch_shuffling_random_seed) - if get_label_frequencies: - self.punct_label_frequencies = self._calculate_and_save_label_frequencies(self.punct_labels, 'punct') - self.capit_label_frequencies = self._calculate_and_save_label_frequencies(self.capit_labels, 'capit') - if self.use_bucketing: - self.batches = self._pack_into_batches( - input_ids=self.input_ids, - subtokens_mask=self.subtokens_mask, - punct_labels=self.punct_labels, - capit_labels=self.capit_labels, - waveforms=self.waveforms, - audio_lengths=self.waveforms_length, - audio_filepaths=self.audio_filepaths, - ) - else: - self.batches = self._form_batches( - input_ids=self.input_ids, - subtokens_mask=self.subtokens_mask, - punct_labels=self.punct_labels, - capit_labels=self.capit_labels, - waveforms=self.waveforms, - audio_lengths=self.waveforms_length, - audio_filepaths=self.audio_filepaths, - ) - - def _get_path_to_pkl_features( - self, - text_file: Path, - labels_file: Path, - cache_dir: Optional[Union[str, os.PathLike]], - max_seq_length: int, - num_samples: int, - ) -> Path: - if cache_dir is None: - cache_dir = text_file.parent - else: - cache_dir = Path(cache_dir).expanduser() - vocab_size = getattr(self.tokenizer, "vocab_size", 0) - features_pkl = cache_dir / "cached.{}.{}.max_seq_length{}.vocab{}.{}.punctuation_capitalization.pkl".format( - '__' + text_file.name + '__' + labels_file.name + '__', - self.tokenizer.name, - max_seq_length, - vocab_size, - f'num_samples{num_samples}' if num_samples > 0 else 'all_samples', - ) - return features_pkl - - @staticmethod - def _check_constructor_parameters( - text_file: Union[str, os.PathLike], - labels_file: Union[str, os.PathLike], - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - punct_label_vocab_file: Union[str, os.PathLike], - capit_label_vocab_file: Union[str, os.PathLike], - num_samples: int, - use_cache: bool, - number_of_batches_is_multiple_of: int, - use_audio: bool = False, - audio_file: Optional[Union[str, os.PathLike]] = None, - sample_rate: Optional[int] = None, - ) -> None: - if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1 and not use_cache: - raise ValueError( - f"If you 
already created process group and the world size is greater than 1, then `use_cache` " - f"parameter has to be `True`. Only master process prepares features and if `use_cache=False`, then " - f"other processes will not be able to obtain features. Alternatively, you may set `use_cache=False` " - f"and set up data before spawning processes. Use `cache_dir` dataset directory with " - f"`text_file` and `labels_file` is read-only." - ) - if not (os.path.exists(text_file) and os.path.exists(labels_file)): - raise FileNotFoundError( - f'{text_file} or {labels_file} not found. The data should be split into 2 files: text.txt and ' - f'labels.txt. Each line of the text.txt file contains text sequences, where words are separated with ' - f'spaces. The labels.txt file contains corresponding labels for each word in text.txt, the labels are ' - f'separated with spaces. Each line of the files should follow the format:\n' - f' [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and ' - f' [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' - ) - if not use_audio and audio_file: - raise ValueError(f"Audio file {audio_file} was passed but use_audio was set to False") - if use_audio and audio_file and not os.path.exists(audio_file): - raise FileNotFoundError( - f'use_audio was set to True but {audio_file} not found. Audio data should be listed in .txt file with one path per line' - ) - if punct_label_ids is not None and punct_label_vocab_file is not None: - punct_label_vocab_file = Path(punct_label_vocab_file).expanduser() - file_punct_label_ids = load_label_ids(punct_label_vocab_file) - if file_punct_label_ids != punct_label_ids: - raise_not_equal_labels_error( - first_labels=punct_label_ids, - second_labels=file_punct_label_ids, - first_labels_desc='Punctuation labels passed to the `PunctuationCapitalizationDataset` ' - 'constructor in parameter `punct_label_ids`', - second_labels_desc=f'Punctuation labels loaded from file {punct_label_vocab_file} path to which ' - f'is passed in parameter `punct_label_vocab_file`', - ) - if capit_label_ids is not None and capit_label_vocab_file is not None: - capit_vocab_file = Path(capit_label_vocab_file).expanduser() - file_capit_label_ids = load_label_ids(capit_vocab_file) - if file_capit_label_ids != capit_label_ids: - raise_not_equal_labels_error( - first_labels=capit_label_ids, - second_labels=file_capit_label_ids, - first_labels_desc='Capitalization labels passed to the `PunctuationCapitalizationDataset` ' - 'constructor in parameter `capit_label_ids`', - second_labels_desc=f'Capitalization labels loaded from file {capit_label_vocab_file} path to ' - f'which is passed in parameter `capit_label_vocab_file`', - ) - if num_samples == 0: - raise ValueError( - f"Parameter `num_samples` has to be positive or negative whereas `num_samples={num_samples}`. " - f"Negative `num_samples` is for using all samples in a dataset." - ) - if number_of_batches_is_multiple_of < 1 or not isinstance(number_of_batches_is_multiple_of, int): - raise ValueError( - f"Parameter `number_of_batches_is_multiple_of` has to be positive integer whereas " - f"{number_of_batches_is_multiple_of} is given." 
- ) - - if use_audio and not isinstance(sample_rate, int): - raise TypeError(f'use_audio was set to True but sample_rate was not set') - - if use_audio and sample_rate < 1: - raise ValueError(f'sample_rate set to {sample_rate} but it cannot be less than 1') - - def _check_label_ids_loaded_from_pkl( - self, - parameter_punct_label_ids: Dict[str, int], - parameter_capit_label_ids: Dict[str, int], - pkl_punct_label_ids: Any, - pkl_capit_label_ids: Any, - punct_label_vocab_file: Optional[Path], - capit_label_vocab_file: Optional[Path], - ) -> None: - if not isinstance(pkl_punct_label_ids, dict): - raise ValueError( - f"Punctuation label ids loaded from features file {self.features_pkl} have wrong type " - f"{type(pkl_punct_label_ids)}" - ) - if parameter_punct_label_ids is not None: - if parameter_punct_label_ids != pkl_punct_label_ids: - raise_not_equal_labels_error( - first_labels=parameter_punct_label_ids, - second_labels=pkl_punct_label_ids, - first_labels_desc="Punctuation labels passed in parameter `punct_label_ids`" - if punct_label_vocab_file is None - else f"Punctuation labels loaded from file {punct_label_vocab_file}", - second_labels_desc=f"Punctuation label ids loaded from features file {self.features_pkl}", - ) - if not isinstance(pkl_capit_label_ids, dict): - raise ValueError( - f"Capitalization label ids loaded from features file {self.features_pkl} has wrong type " - f"{type(pkl_capit_label_ids)}" - ) - if parameter_capit_label_ids is not None: - if parameter_capit_label_ids != pkl_capit_label_ids: - raise_not_equal_labels_error( - first_labels=parameter_capit_label_ids, - second_labels=pkl_capit_label_ids, - first_labels_desc="Capitalization labels passed in parameter `capit_label_ids`" - if capit_label_vocab_file is None - else f"Capitalization labels loaded from file {capit_label_vocab_file}", - second_labels_desc=f"Capitalization label ids loaded from features file {self.features_pkl}", - ) - - @staticmethod - def _check_label_ids_vs_unique_labels( - label_ids: Dict[str, int], unique_labels: Set[str], label_type: str, task: str, label_file: Path - ) -> None: - if unique_labels - set(label_ids): - not_present_labels = list(unique_labels - set(label_ids)) - raise ValueError( - f"{len(not_present_labels)} {task} labels found in {label_file} are not present in " - f"`{label_type}_label_ids`. Examples of unexpected labels from {label_file}: {not_present_labels[:3]}" - ) - - @staticmethod - def _read_dataset( - text_file: Path, labels_file: Path, num_samples: int, audio_file: Optional[Path] = None - ) -> Union[Tuple[Any, Any, Any, Set[Any], Set[Any], Any], Tuple[Any, Any, Any, Set[Any], Set[Any]]]: - with open(text_file, 'r', encoding='utf_8') as f: - text_lines = f.readlines() - punct_unique_labels, capit_unique_labels = set(), set() - punct_labels_lines, capit_labels_lines = [], [] - with labels_file.open(encoding='utf_8') as f: - for i, line in enumerate(f): - pairs = line.split() - if not all([len(p) == 2 for p in pairs]): - raise ValueError( - f"Some label pairs are not pairs but have wrong length (!= 2) in line {i} in label file " - f"{labels_file}" - ) - words = text_lines[i].split() - if len(pairs) != len(words): - raise ValueError( - f"In line {i} in text file {text_file} number of words {len(words)} is not equal to the " - f"number of labels {len(pairs)} in labels file {labels_file}." 
- ) - punct_line, capit_line = zip(*pairs) - punct_labels_lines.append(punct_line) - capit_labels_lines.append(capit_line) - punct_unique_labels.update(punct_line) - capit_unique_labels.update(capit_line) - if len(punct_labels_lines) != len(text_lines): - raise ValueError( - f"Number of text lines {len(text_lines)} in text file {text_file} is not equal to the number of lines " - f"{len(punct_labels_lines)} in labels file {labels_file}." - ) - - if audio_file: - with open(audio_file, 'r') as f: - audio_lines = f.readlines() - if len(audio_lines) != len(text_lines): - raise ValueError( - f'Number of lines in {audio_file} equals {len(audio_lines)} which is not equal to ' - f'number of lines in {text_file} which is {len(text_lines)}' - ) - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines, audio_lines)) - else: - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - if len(dataset) == 0: - raise ValueError(f"Dataset loaded from files {text_file} and {labels_file} is empty.") - if num_samples > 0: - dataset = dataset[:num_samples] - if audio_file: - text_lines, punct_labels_lines, capit_labels_lines, audio_lines = zip(*dataset) - return ( - text_lines, - punct_labels_lines, - capit_labels_lines, - punct_unique_labels, - capit_unique_labels, - audio_lines, - ) - else: - text_lines, punct_labels_lines, capit_labels_lines = zip(*dataset) - return text_lines, punct_labels_lines, capit_labels_lines, punct_unique_labels, capit_unique_labels, None - - @staticmethod - def calc_batch_seq_length(queries: List[np.ndarray], length_is_multiple_of: int) -> int: - return ceil(max([len(elem) for elem in queries]) / length_is_multiple_of) * length_is_multiple_of - - def _adjust_number_of_batches( - self, - input_ids: List[np.ndarray], - batch_beginnings: List[int], - batch_sizes: List[int], - batch_seq_lengths: List[int], - ) -> Tuple[List[int], List[int], List[int]]: - """ - If length of ``batch_sizes`` list is not divisible by ``self.number_of_batches_is_multiple_of``, then - one or several batches are split into parts until number of batches is divisible by - ``self.number_of_batches_is_multiple_of``. - - The method selects a batch and tries to slice smaller batches with 8 elements each from the batch. If - the batch cannot be sliced any further and there are still not enough batches, then the next batch from dataset - is selected. - - If slicing batches of size 8 is not enough, then batches of size 1 are created. - - If dataset is too small to create enough batches, then a warning is shown. - - Args: - input_ids: tokenized queries of the dataset. `input_ids` are expected to be sorted by length in ascending - order. - batch_beginnings: indices of first elements of batches created inside :meth:`_mark_up_batches` method. - Expected to be sorted in ascending order. - batch_sizes: sizes of batches created inside :meth:`_mark_up_batches` method. - batch_seq_lengths: lengths of elements in batch after padding created inside :meth:`_mark_up_batches` - method. 
- - Returns: - batch_beginnings: a list of indices in ``input_ids`` of first samples of every batch - batch_sizes: a list of numbers of samples in batches - batch_seq_lengths: a list of sequence lengths after padding for every batch - """ - batch_beginnings, batch_sizes = batch_beginnings.copy(), batch_sizes.copy() - batch_seq_lengths = batch_seq_lengths.copy() - num_missing_batches = ( - self.number_of_batches_is_multiple_of - len(batch_sizes) % self.number_of_batches_is_multiple_of - ) - if num_missing_batches == 0: - return batch_beginnings, batch_sizes, batch_seq_lengths - if sum(batch_sizes) - len(batch_sizes) < num_missing_batches: - logging.warning( - f"Unable to achieve number of batches multiple of {self.number_of_batches_is_multiple_of} because " - f"dataset in files '{self.text_file}' and '{self.labels_file}' contains not enough queries " - f"({sum(batch_sizes)}) or queries in the dataset are too long. Dataset will have " - f"{len(batch_sizes)} batches instead. For validation or test dataset if multiple GPUs are used " - f"this will lead to distorted metrics because some batches will be processed several times. " - f"To fix this problem you may try to tweak (increase) parameter `tokens_in_batch`, though result is " - f"not guaranteed." - ) - return batch_beginnings, batch_sizes, batch_seq_lengths - num_cut = 0 - for ss in [8, 1]: # ss - split_size - old_num_batches = len(batch_sizes) - # Starting from the last batch because its size is likely to be not multiple of 8. Thus number of - # batches which size is not multiple of 8 can be reduced by 1. - original_batch_index = old_num_batches - 1 - while original_batch_index >= 0 and num_cut < num_missing_batches: - bs, bb = batch_sizes[original_batch_index], batch_beginnings[original_batch_index] - rb = 0 # an index of sliced first element of sliced batch in original batch (relative beginning) - if rb < bs - ss: - while rb < bs - ss and num_cut < num_missing_batches: - batch_sizes.append(ss) - batch_beginnings.append(bb + rb) - batch_seq_lengths.append( - self.calc_batch_seq_length(input_ids[bb + rb : bb + rb + ss], length_is_multiple_of=8) - ) - rb += ss - num_cut += 1 - assert len(input_ids[bb + rb : bb + bs]) > 0 - batch_sizes[original_batch_index] = bs - rb - batch_beginnings[original_batch_index] = bb + rb - batch_seq_lengths[original_batch_index] = self.calc_batch_seq_length( - input_ids[bb + rb : bb + bs], length_is_multiple_of=8 - ) - original_batch_index -= 1 - # Keeping order of batches. - batch_beginnings, batch_sizes, batch_seq_lengths = map( - list, zip(*sorted(zip(batch_beginnings, batch_sizes, batch_seq_lengths), key=lambda x: x[0])) - ) - assert len(batch_beginnings) % self.number_of_batches_is_multiple_of == 0 - assert len(batch_sizes) % self.number_of_batches_is_multiple_of == 0 - assert len(batch_seq_lengths) % self.number_of_batches_is_multiple_of == 0 - return batch_beginnings, batch_sizes, batch_seq_lengths - - def _mark_up_batches(self, input_ids: List[np.ndarray]) -> Tuple[List[int], List[int], List[int]]: - """ - Computes indices of first samples in batch, batch sizes, seq lengths for batches. ``input_ids`` has to be - sorted by number of tokens in ascending order. 
- - Batches are marked up with respect to following conditions: - - total number of tokens in batch including paddings is less or equal to ``self.tokens_in_batch`` - - batch size is evenly divisible by 8 (except for the last batch) - - seq length (elements of the third returned object) is evenly divisible by 8 - - If ``self.batch_mark_up_progress_queue`` is not None, then the progress in mark up is reported via - ``self.batch_mark_up_progress_queue``. Otherwise, ``tqdm`` instance is created in this function. - - Args: - input_ids: a list of 1D int32 arrays. Elements of ``input_ids`` have to be sorted by length in ascending - order - - Returns: - batch_beginnings: a list of indices in ``input_ids`` of first samples of every batch - batch_sizes: a list of numbers of samples in batches - batch_seq_lengths: a list of sequence lengths after padding for every batch - """ - batch_beginnings, batch_sizes, batch_seq_lengths = [], [], [] - current_max_length = 0 - start = 0 - if self.batch_mark_up_progress_queue is None: - inp_iterator = tqdm(enumerate(input_ids), total=len(input_ids), desc="Batch mark up", unit="query") - else: - inp_iterator = enumerate(input_ids) - progress_made = 0 - for i, inp in inp_iterator: - current_max_length = max(current_max_length, ceil(len(inp) / 8) * 8) - if current_max_length * (i + 1 - start) > self.tokens_in_batch: - batch_size = (i - start) // 8 * 8 - if batch_size == 0: - if i > start: - batch_size = i - start - logging.warning( - f"Could not create batch with multiple of 8 size. Probably, there is a too long sequence " - f"in the dataset or parameter `tokens_in_batch` is too small. Current length of sequences " - f"in batch is {current_max_length}. Batch size will be reduced to {batch_size}. " - f"tokens_in_batch={self.tokens_in_batch}. The batch includes sequences from " - f"{start} to {i - 1}." - ) - else: - logging.warning( - f"Input sequence number {i - 1} is too long. Could not fit it into batch with " - f"{self.tokens_in_batch} tokens. Sequence number {i - 1} will not be added to batches." 
- ) - start = i - current_max_length = ceil(len(inp) / 8) * 8 - continue - seq_length = self.calc_batch_seq_length(input_ids[start : start + batch_size], length_is_multiple_of=8) - batch_beginnings.append(start) - batch_sizes.append(batch_size) - batch_seq_lengths.append(seq_length) - start += batch_size - current_max_length = self.calc_batch_seq_length(input_ids[start : i + 1], length_is_multiple_of=8) - if self.batch_mark_up_progress_queue is not None: - progress_made += 1 - if progress_made >= BATCH_MARK_UP_PROGRESS_REPORT_PERIOD: - self.batch_mark_up_progress_queue.put(progress_made) - progress_made = 0 - if start < len(input_ids): - seq_length = self.calc_batch_seq_length(input_ids[start:], length_is_multiple_of=8) - batch_beginnings.append(start) - batch_sizes.append(len(input_ids) - start) - batch_seq_lengths.append(seq_length) - if self.batch_mark_up_progress_queue is not None: - self.batch_mark_up_progress_queue.put(progress_made) - if len(batch_beginnings) % self.number_of_batches_is_multiple_of: - batch_beginnings, batch_sizes, batch_seq_lengths = self._adjust_number_of_batches( - input_ids, batch_beginnings, batch_sizes, batch_seq_lengths - ) - assert sum(batch_sizes) == len(input_ids) - for i in range(len(batch_beginnings) - 1): - assert batch_beginnings[i] + batch_sizes[i] == batch_beginnings[i + 1] - assert batch_seq_lengths[i] >= max( - [len(inp) for inp in input_ids[batch_beginnings[i] : batch_beginnings[i] + batch_sizes[i]]] - ) - return batch_beginnings, batch_sizes, batch_seq_lengths - - def _form_batches( - self, - input_ids: List[np.ndarray], - subtokens_mask: List[np.ndarray], - punct_labels: List[np.ndarray], - capit_labels: List[np.ndarray], - waveforms: Optional[List[np.ndarray]] = None, - audio_lengths: Optional[List[np.ndarray]] = None, - audio_filepaths: Optional[List[str]] = None, - ) -> List[Dict[str, np.ndarray]]: - """ - - Args: - input_ids: a list of 1D int32 arrays which contain token ids of dataset source - subtokens_mask: a list of 1D boolean arrays which elements are ``True`` if corresponding token is the - first token in some word - punct_labels: a list of 1D int32 arrays which contain encoded punctuation labels - capit_labels: a list of 1D int32 arrays which contain encoded capitalization labels - waveforms: a list of 1D float arrays which contain raw waveforms of audios. - audio_lengths: a list of 1D int32 arrays which contain length of corresponding audio from `waveforms` - audio_filepaths: a list of strings which contain paths to audio - - Returns: - a list of batches. Each batch is a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array; - - ``'subtokens_mask'``: a boolean numpy array; - - ``'punct_labels'``: a ``np.int32`` numpy array; - - ``'capit_labels'``: a ``np.int32`` numpy array. - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then a batch also contain items - - ``'segment_ids'``: a ``np.int8`` numpy array; - - ``'input_mask'``: a boolean numpy array; - - ``'loss_mask'``: a boolean numpy array. - If ``waveforms`` is not ``None``, then a batch also contain items - - ``features``: a ``np.float64`` numpy array. - - ``features_length`` a ``np.int32`` numpy array. - If ``audio_filepaths`` is not ``None``, then a natch also contain items - - ``audio_filepaths`` a list of strings. - - The values of a batch dictionary are numpy arrays of identical shape. 
- """ - batches = [] - dummy = [None] * len(input_ids) - - zipped = list( - zip( - input_ids, - subtokens_mask, - punct_labels, - capit_labels, - waveforms if waveforms else dummy, - audio_lengths if audio_lengths else dummy, - audio_filepaths if audio_filepaths else dummy, - ) - ) - - for item in zipped: - batch = { - "input_ids": item[0], - "subtokens_mask": item[1], - "punct_labels": item[2].astype(np.int64), - "capit_labels": item[3].astype(np.int64), - } - if self.use_audio and self.preload_audios: - batch['features'] = item[4].astype(np.float64) - batch['features_length'] = item[5] - elif self.use_audio and not self.preload_audios: - batch['audio_filepaths'] = item[6] - batches.append(batch) - return batches - - def _pack_into_batches( - self, - input_ids: List[np.ndarray], - subtokens_mask: List[np.ndarray], - punct_labels: List[np.ndarray], - capit_labels: List[np.ndarray], - waveforms: Optional[List[np.ndarray]] = None, - audio_lengths: Optional[List[np.ndarray]] = None, - audio_filepaths: Optional[List[str]] = None, - ) -> List[Dict[str, np.ndarray]]: - """ - Shuffle input sequences, sort them by number of tokens, pad, and pack into batches which satisfy following - conditions: - - total number of tokens in batch including paddings is less or equal to ``self.tokens_in_batch`` - - batch size is evenly divisible by 8 (except for the last batch) - - seq length (elements of the third returned object) is evenly divisible by 8 - Created batches are shuffled before returning. - - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then ``'segment_ids'``, ``'loss_mask'``, and - ``'input_mask'`` are added to the batch. - - If ``self.batch_building_progress_queue`` is not ``None``, then padding progress is reported to - ``self.batch_building_progress_queue``. Otherwise, a new ``tqdm`` instance is created in ``pack_into_batches`` - method. - - Args: - input_ids: a list of 1D int32 arrays which contain token ids of dataset source - subtokens_mask: a list of 1D boolean arrays which elements are ``True`` if corresponding token is the - first token in some word - punct_labels: a list of 1D int32 arrays which contain encoded punctuation labels - capit_labels: a list of 1D int32 arrays which contain encoded capitalization labels - waveforms: a list of 1D float arrays which contain raw waveforms of audios. - audio_lengths: a list of 1D int32 arrays which contain length of corresponding audio from `waveforms` - audio_filepaths: a list of strings which contain paths to audio - - Returns: - a list of batches. Each batch is a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array; - - ``'subtokens_mask'``: a boolean numpy array; - - ``'punct_labels'``: a ``np.int32`` numpy array; - - ``'capit_labels'``: a ``np.int32`` numpy array. - If ``self.add_masks_and_segment_ids_to_batch`` is ``True``, then a batch also contain items - - ``'segment_ids'``: a ``np.int8`` numpy array; - - ``'input_mask'``: a boolean numpy array; - - ``'loss_mask'``: a boolean numpy array. - If ``waveforms`` is not ``None``, then a batch also contain items - - ``features``: a ``np.float64`` numpy array. - - ``features_length`` a ``np.int32`` numpy array. - If ``audio_filepaths`` is not ``None``, then a natch also contain items - - ``audio_filepaths`` a list of strings. - - The values of a batch dictionary are numpy arrays of identical shape. 
- """ - dummy = [None] * len(input_ids) - zipped = list( - zip( - input_ids, - subtokens_mask, - punct_labels, - capit_labels, - waveforms if waveforms else dummy, - audio_lengths if audio_lengths else dummy, - audio_filepaths if audio_filepaths else dummy, - ) - ) - self.batch_shuffling_random_state.shuffle(zipped) - - dim_sort = 4 if self.use_audio and self.preload_audios else 0 - - input_ids, subtokens_mask, punct_labels, capit_labels, waveforms, audio_lengths, audio_filepaths = zip( - *sorted(zipped, key=lambda x: x[dim_sort].shape[0]) - ) - batch_beginnings, batch_sizes, batch_seq_lengths = self._mark_up_batches(input_ids) - batches = [] - if self.batch_building_progress_queue is None: - inp_iterator = tqdm( - zip(batch_beginnings, batch_sizes, batch_seq_lengths), - total=len(batch_beginnings), - desc="Batch building", - unit="batch", - ) - else: - # In this case we report number of queries not number of batches - inp_iterator = zip(batch_beginnings, batch_sizes, batch_seq_lengths) - progress_made = 0 - for start, size, length in inp_iterator: - batch_input_ids = pad(input_ids[start : start + size], length, self.tokenizer.pad_id) - batch_subtokens_mask = pad(subtokens_mask[start : start + size], length, False) - batch = { - "input_ids": batch_input_ids, - "subtokens_mask": batch_subtokens_mask, - "punct_labels": pad( - punct_labels[start : start + size], length, self.punct_label_ids[self.pad_label] - ).astype(np.int64), - "capit_labels": pad( - capit_labels[start : start + size], length, self.capit_label_ids[self.pad_label] - ).astype(np.int64), - } - if self.use_audio and self.preload_audios: - batch['features'] = pad( - waveforms[start : start + size], max(audio_lengths[start : start + size]), 0.0 - ).astype(np.float64) - batch['features_length'] = audio_lengths[start : start + size] - elif self.use_audio and not self.preload_audios: - batch['audio_filepaths'] = audio_filepaths[start : start + size] - - if self.add_masks_and_segment_ids_to_batch: - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch_input_ids, - batch_subtokens_mask, - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = batch_segment_ids - batch['input_mask'] = batch_input_mask - batch['loss_mask'] = batch_loss_mask - batches.append(batch) - if self.batch_building_progress_queue is not None: - progress_made += size - if progress_made >= BATCH_BUILDING_PROGRESS_REPORT_PERIOD: - self.batch_building_progress_queue.put(progress_made) - progress_made = 0 - if self.batch_building_progress_queue is not None: - self.batch_building_progress_queue.put(progress_made) - self.batch_shuffling_random_state.shuffle(batches) - return batches - - def repack_batches_with_shuffle(self) -> None: - """A function for proper shuffling of a dataset. 
Pytorch data loader shuffling will only permute batches.""" - if not self.use_bucketing: - return - logging.info("Shuffling training dataset") - self.batches = self._pack_into_batches( - self.input_ids, - self.subtokens_mask, - self.punct_labels, - self.capit_labels, - self.waveforms, - self.waveforms_length, - self.audio_filepaths, - ) - - def _calculate_and_save_label_frequencies(self, all_labels: List[np.ndarray], name: str) -> Dict[str, float]: - """Calculates and saves labels frequencies in :attr:`label_info_save_dir`.""" - merged_labels = itertools.chain.from_iterable(all_labels) - if self.verbose: - logging.info('Three most popular labels') - self.label_info_save_dir.mkdir(parents=True, exist_ok=True) - _, label_frequencies, _ = get_label_stats( - merged_labels, str(self.label_info_save_dir / f'label_count_{name}.tsv') - ) - return label_frequencies - - def save_labels_and_get_file_paths( - self, punct_labels_file_name: str, capit_labels_file_name: str - ) -> Tuple[Path, Path]: - """ - Saves label ids into files located in ``self.label_info_save_dir``. Saved label ids are usually used for - ``.nemo`` checkpoint creation. - - The signatures of this method and the signature of the method - :meth:`~nemo.collections.nlp.data.token_classification.BertPunctuationCapitalizationTarredDataset.save_labels_and_get_file_paths` - must be identical. - - Args: - punct_labels_file_name (:obj:`str`): a name of a punctuation labels file - capit_labels_file_name (:obj:`str`): a name of a capitalization labels file - - Returns: - :obj:`Tuple[pathlib.Path, pathlib.Path]`: a tuple containing: - - - :obj:`pathlib.Path`: a path to the saved punctuation labels file - - :obj:`pathlib.Path`: a path to the saved capitalization labels file - """ - nemo_dir = self.label_info_save_dir / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - punct_labels_file = nemo_dir / punct_labels_file_name - capit_labels_file = nemo_dir / capit_labels_file_name - save_label_ids(self.punct_label_ids, punct_labels_file) - save_label_ids(self.capit_label_ids, capit_labels_file) - return punct_labels_file, capit_labels_file - - def __len__(self) -> int: - return len(self.batches) - - def collate_fn(self, batches: List[Dict[str, np.ndarray]]) -> Dict[str, torch.Tensor]: - """ - If ``self.use_bucketing`` set to ``True`` returns zeroth batch from ``batches`` list passed for collating and casts ``'segment_ids'``, ``'punct_labels'``, - ``'capit_labels'`` to types supported by - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - or :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationLexicalAudioModel` if ``self.use_audio`` set to ``True`` - All output tensors have shape ``[Batch, Time]``. - - .. 
warning:: - A ``batch_size`` parameter of a PyTorch data loader and sampler has to be ``1`` if ``self.use_bucketing`` set to ``True`` - - Args: - batches (:obj:`List[Dict[str, np.ndarray]]`): a list containing 1 batch passed for collating - - Returns: - :obj:`Dict[str, torch.Tensor]`: a batch dictionary with following items (for detailed description of batch - items see method :meth:`__getitem__`): - - - ``'input_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'subtokens_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'punct_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'capit_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'segment_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'input_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'loss_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor. - - ``'features'`` (:obj:`torch.Tensor`): :obj:`torch.float` tensor. - - ``'features_length'`` (:obj:`torch.Tensor`): :obj:`torch.long` tensor. - """ - if self.use_bucketing: - batch = {k: torch.as_tensor(v) for k, v in batches[0].items() if k != 'audio_filepaths'} - batch['segment_ids'] = batch['segment_ids'].int() - batch['punct_labels'] = batch['punct_labels'].long() - batch['capit_labels'] = batch['capit_labels'].long() - if self.use_audio and self.preload_audios: - batch['features'] = batch['features'].to(torch.float32) - return batch - else: - for batch in batches: - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch['input_ids'], - batch['subtokens_mask'], - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = torch.as_tensor(batch_segment_ids, dtype=torch.int) - batch['input_mask'] = torch.as_tensor(batch_input_mask) - batch['loss_mask'] = torch.as_tensor(batch_loss_mask) - batch['input_ids'] = torch.as_tensor(batch['input_ids'], dtype=torch.int) - batch['subtokens_mask'] = torch.as_tensor(batch['subtokens_mask']) - batch['punct_labels'] = torch.as_tensor(batch['punct_labels'], dtype=torch.long) - batch['capit_labels'] = torch.as_tensor(batch['capit_labels'], dtype=torch.long) - if 'features' in batch: - batch['features'] = torch.as_tensor(batch['features'], dtype=torch.float) - batch['features_length'] = torch.as_tensor(batch['features_length'], dtype=torch.long) - elif self.use_audio: - if ASR_AVAILABLE: - waveform = AudioSegment.from_file(batch['audio_filepaths'], target_sr=self.sample_rate) - batch['features'] = torch.as_tensor(waveform.samples, dtype=torch.float) - batch['features_length'] = torch.as_tensor(waveform.num_samples, dtype=torch.long) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - - segment_ids = pad_sequence([batch['segment_ids'] for batch in batches]) - input_mask = pad_sequence([batch['input_mask'] for batch in batches]) - loss_mask = pad_sequence([batch['loss_mask'] for batch in batches]) - input_ids = pad_sequence([batch['input_ids'] for batch in batches], padding_value=self.tokenizer.pad_id) - subtokens_mask = pad_sequence([batch['subtokens_mask'] for batch in batches], padding_value=False) - punct_labels = pad_sequence([batch['punct_labels'] for batch in batches], padding_value=0) - capit_labels = pad_sequence([batch['capit_labels'] for batch in batches], padding_value=0) - features = 
pad_sequence([batch['features'] for batch in batches], padding_value=0.0) - features_length = torch.tensor([batch['features_length'] for batch in batches]) - return { - 'input_ids': input_ids.T, - 'subtokens_mask': subtokens_mask.T, - 'punct_labels': punct_labels.T, - 'capit_labels': capit_labels.T, - 'features': features.T, - 'features_length': features_length, - 'segment_ids': segment_ids.T, - 'input_mask': input_mask.T, - 'loss_mask': loss_mask.T, - } - - def __getitem__(self, idx: int) -> Dict[str, np.ndarray]: - """ - Return a batch with index ``idx``. The values of a batch dictionary are numpy arrays of identical shapes - ``[Batch, Time]``. Labels are identical for all tokens in a word. For example, if - - - word ``'Tokenization'`` is tokenized into tokens ``['token', 'ization']``, - - it is followed by comma, - - then punctuation labels are ``[',', ',']`` and capitalization labels are ``['U', 'U']`` (``'U'`` is a label - for words which start with upper case character). - - Args: - idx: an index of returned batch - - Returns: - :obj:`Dict[str, np.ndarray]`: a dictionary with items: - - - ``'input_ids'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded tokens, - - ``'subtokens_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if they - correspond to first token in a word, - - ``'punct_labels'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded punctuation - labels, - - ``'capit_labels'`` (:obj:`numpy.ndarray`): :obj:`numpy.int32` array containing encoded capitalization - labels. - - ``'segment_ids'`` (:obj:`numpy.ndarray`): :obj:`numpy.int8` array filled with zeros (BERT token types - in HuggingFace terminology) (if ``self.add_masks_and_segment_ids_to_batch`` is ``False``, then these - items is missing), - - ``'input_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if corresponding - token is not a padding token (if ``self.add_masks_and_segment_ids_to_batch`` is ``False``, then these - items is missing), - - ``'loss_mask'`` (:obj:`numpy.ndarray`): :obj:`bool` array which elements are ``True`` if loss is - computed for corresponding token. See more in description of constructor parameters - ``ignore_start_end``, ``ignore_extra_tokens`` (if ``self.add_masks_and_segment_ids_to_batch`` is - ``False``, then these items is missing). - - ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float64` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty. - - ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.longlong` array of number of samples per audio. - - ``'audio_filepaths'`` (:obj:`List`) :obj:`str` contains paths of audio files if ``self.preload_audio`` set to ``False`` - """ - return self.batches[idx] diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py deleted file mode 100644 index 13bb30403553..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import itertools -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from numpy import ndarray -from torch import Tensor -from torch.nn.utils.rnn import pad_sequence - -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.collections.nlp.data import get_stats -from nemo.core import Dataset -from nemo.core.neural_types import ChannelType, Index, MaskType, NeuralType -from nemo.core.neural_types.elements import AudioSignal, BoolType, LengthsType -from nemo.utils import logging - -try: - from nemo.collections.asr.parts.preprocessing import AudioSegment - - ASR_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False - - -def get_features_infer( - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = 64, - step: Optional[int] = 8, - margin: Optional[int] = 16, - audio_queries: Optional[Union[List[bytes], List[str]]] = None, - target_sr: Optional[int] = None, -) -> Tuple[ - List[List[int]], - List[List[int]], - List[List[int]], - List[List[int]], - List[int], - List[int], - List[bool], - List[bool], - Optional[List[float]], - Optional[List[int]], -]: - """ - Processes the data and returns features. - - Args: - queries: text sequences - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - step: relative shift of consequent segments into which long queries are split. Long queries are split into - segments which can overlap. Parameter ``step`` controls such overlapping. Imagine that queries are - tokenized into characters, ``max_seq_length=5``, and ``step=2``. In such a case query "hello" is - tokenized into segments ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. - margin: number of subtokens near edges of segments which are not used for punctuation and capitalization - prediction. The first segment does not have left margin and the last segment does not have right - margin. For example, if input sequence is tokenized into characters, ``max_seq_length=5``, - ``step=1``, and ``margin=1``, then query "hello" will be tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], - ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions - computation, margins are removed. In the next list, subtokens which logits are not used for final - predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], - ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. - audio_queries (:obj:`List[str]`, `optional`): paths to audio files. - target_sr (:obj:`int`, `optional`): target sample rate for audios. 
- - Returns: - all_input_ids: list of input ids of all segments - all_segment_ids: token type ids of all segments - all_input_mask: attention mask to use for BERT model - all_subtokens_mask: masks out all subwords besides the first one - all_quantities_of_preceding_words: number of words in query preceding a segment. Used for joining - predictions from overlapping segments. - all_query_ids: index of a query to which segment belongs - all_is_first: is segment first segment in a query - all_is_last: is segment last segment in a query - all_audio_queries: audio waveform samples for every segment (only if ``audio_queries`` is passed) - all_audio_lengths: number of audio samples for every segment (only if ``audio_queries`` is passed) - """ - st = [] - stm = [] - sent_lengths = [] - audios = [] - audio_queries = audio_queries if audio_queries else [None] * len(queries)  # Dummy if no `audio_queries` passed - for i, (query, audio_query) in enumerate(zip(queries, audio_queries)): - subtokens, subtokens_mask = _get_subtokens_and_subtokens_mask(query, tokenizer) - sent_lengths.append(len(subtokens)) - st.append(subtokens) - stm.append(subtokens_mask) - if audio_query: - if ASR_AVAILABLE: - if isinstance(audio_query, bytes): - audios.append(AudioSegment.from_file(io.BytesIO(audio_query), target_sr=target_sr)) - elif isinstance(audio_query, str): - audios.append(AudioSegment.from_file(audio_query.strip(), target_sr=target_sr)) - else: - raise ModuleNotFoundError( - 'Nemo ASR was not installed, see https://github.com/NVIDIA/NeMo#installation for installation instructions' - ) - audios = audios if len(audios) else [None] * len(st) - _check_max_seq_length_and_margin_and_step(max_seq_length, margin, step) - if max_seq_length > max(sent_lengths) + 2: - max_seq_length = max(sent_lengths) + 2 - # If `max_seq_length` is greater than the maximum length of input queries, parameters ``margin`` and ``step`` are - # not used. - step = 1 - # Maximum number of word subtokens in segment. The first and the last tokens in segment are [CLS] and [SEP] - length = max_seq_length - 2 - else: - # Maximum number of word subtokens in segment. 
The first and the last tokens in segment are [CLS] and [SEP] - length = max_seq_length - 2 - step = min(length - margin * 2, step) - logging.info(f'Max length: {max_seq_length}') - get_stats(sent_lengths) - all_input_ids, all_segment_ids, all_subtokens_mask, all_input_mask = [], [], [], [] - all_quantities_of_preceding_words, all_query_ids, all_is_first, all_is_last = [], [], [], [] - all_audio_queries, all_audio_lengths = [], [] - for q_i, (query_st, query_audio) in enumerate(zip(st, audios)): - q_inp_ids, q_segment_ids, q_subtokens_mask, q_inp_mask, q_quantities_of_preceding_words = [], [], [], [], [] - q_audio_queries, q_audio_lengths = [], [] - if query_audio and length < len(query_st): - logging.info(f'Ignoring query with id {q_i}') - continue - for i in range(0, max(len(query_st), length) - length + step, step): - subtokens = [tokenizer.cls_token] + query_st[i : i + length] + [tokenizer.sep_token] - q_inp_ids.append(tokenizer.tokens_to_ids(subtokens)) - q_segment_ids.append([0] * len(subtokens)) - q_subtokens_mask.append([False] + stm[q_i][i : i + length] + [False]) - q_inp_mask.append([True] * len(subtokens)) - q_quantities_of_preceding_words.append(np.count_nonzero(stm[q_i][:i])) - if query_audio: - samples = query_audio.samples - q_audio_queries.append(samples) - q_audio_lengths.append(len(samples)) - all_input_ids.append(q_inp_ids) - all_segment_ids.append(q_segment_ids) - all_subtokens_mask.append(q_subtokens_mask) - all_input_mask.append(q_inp_mask) - all_quantities_of_preceding_words.append(q_quantities_of_preceding_words) - all_query_ids.append([q_i] * len(q_inp_ids)) - all_is_first.append([True] + [False] * (len(q_inp_ids) - 1)) - all_is_last.append([False] * (len(q_inp_ids) - 1) + [True]) - if query_audio: - all_audio_queries.append(q_audio_queries) - all_audio_lengths.append(q_audio_lengths) - return ( - list(itertools.chain(*all_input_ids)), - list(itertools.chain(*all_segment_ids)), - list(itertools.chain(*all_input_mask)), - list(itertools.chain(*all_subtokens_mask)), - list(itertools.chain(*all_quantities_of_preceding_words)), - list(itertools.chain(*all_query_ids)), - list(itertools.chain(*all_is_first)), - list(itertools.chain(*all_is_last)), - list(itertools.chain(*all_audio_queries)), - list(itertools.chain(*all_audio_lengths)), - ) - - -def _check_max_seq_length_and_margin_and_step(max_seq_length: int, margin: int, step: int): - """ - Checks values of ``max_seq_length``, ``margin``, and ``step``. - Args: - max_seq_length: a segment length with ``[CLS]`` and ``[SEP]`` tokens - margin: a number of input tokens near edges of segments which are not used in punctuation and capitalization - prediction. - step: offset of consequent segments. - Returns: - None - """ - if max_seq_length < 3: - raise ValueError( - f"Parameter `max_seq_length={max_seq_length}` cannot be less than 3 because `max_seq_length` is a length " - f"of a segment with [CLS] and [SEP] tokens." - ) - if margin >= (max_seq_length - 2) // 2 and margin > 0 or margin < 0: - raise ValueError( - f"Parameter `margin` has to be non-negative and less than `(max_seq_length - 2) // 2`. Don't forget about " - f"[CLS] and [SEP] tokens in the beginning and the end of segment. margin={margin}, " - f"max_seq_length={max_seq_length}" - ) - if step <= 0: - raise ValueError(f"Parameter `step` has to be positive whereas step={step} was given") - if step > max_seq_length - 2 - 2 * margin: - logging.warning( - f"Parameter step={step} is too big. It will be reduced to " - f"`max_seq_length - 2 - 2 * margin`." 
- ) - - -def _get_subtokens_and_subtokens_mask(query: str, tokenizer: TokenizerSpec) -> Tuple[List[str], List[bool]]: - """ - Tokenizes input query into subtokens and creates a subtokens mask. The subtokens mask is an array of the same - length as the subtokens array and contains zeros and ones: if an element of the mask equals 1, then the - corresponding subtoken in the subtokens array is the first subtoken in some word. - Args: - query: a string that will be tokenized - tokenizer: an instance of tokenizer - Returns: - subtokens: list of subtokens - subtokens_mask: list of bools - """ - if isinstance(query, Hypothesis): - query = query.text - words = query.strip().split() - subtokens = [] - subtokens_mask = [] - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - subtokens.extend(word_tokens) - subtokens_mask.append(True) - subtokens_mask.extend([False] * (len(word_tokens) - 1)) - return subtokens, subtokens_mask - - -class BertPunctuationCapitalizationInferDataset(Dataset): - """ - Creates a dataset to use during inference for punctuation and capitalization tasks with a pretrained model. - For a dataset to use during training with labels, see - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset` - and - :class:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset`. - - Parameters ``max_seq_length``, ``step``, ``margin`` are for controlling the way queries are split into segments - which are then processed by the model. Parameter ``max_seq_length`` is a length of a segment after tokenization - including special tokens [CLS] in the beginning and [SEP] in the end of a segment. Parameter ``step`` is a shift - between consequent segments. Parameter ``margin`` is used to exclude the negative effect of subtokens near - borders of segments which have only one side context. - - Args: - queries (:obj:`List[str]`): list of sequences. - tokenizer (:obj:`TokenizerSpec`): a tokenizer which was used for model training. It should have properties - ``cls_id``, ``sep_id``, ``unk_id``, ``pad_id``. - max_seq_length (:obj:`int`, `optional`, defaults to :obj:`64`): max sequence length which includes [CLS] and - [SEP] tokens - step (:obj:`int`, `optional`, defaults to :obj:`8`): relative shift of consequent segments into which long - queries are split. Long queries are split into segments which can overlap. Parameter ``step`` controls such - overlapping. Imagine that queries are tokenized into characters, ``max_seq_length=5``, and ``step=2``. In - such a case query "hello" is tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. - margin (:obj:`int`, `optional`, defaults to :obj:`16`): number of subtokens in the beginning and the end of - segments which are not used for prediction computation. The first segment does not have left margin and the - last segment does not have right margin. For example, if input sequence is tokenized into characters, - ``max_seq_length=5``, ``step=1``, and ``margin=1``, then query "hello" will be tokenized into segments - ``[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], - ['[CLS]', 'l', 'l', 'o', '[SEP]']]``. These segments are passed to the model. Before final predictions - computation, margins are removed. 
In the next list, subtokens which logits are not used for final - predictions computation are marked with asterisk: ``[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], - ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]``. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns neural types of :meth:`collate_fn` output.""" - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'quantities_of_preceding_words': NeuralType(('B',), Index()), - 'query_ids': NeuralType(('B',), Index()), - 'is_first': NeuralType(('B',), BoolType()), - 'is_last': NeuralType(('B',), BoolType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'quantities_of_preceding_words': NeuralType(('B',), Index()), - 'query_ids': NeuralType(('B',), Index()), - 'is_first': NeuralType(('B',), BoolType()), - 'is_last': NeuralType(('B',), BoolType()), - } - - def __init__( - self, - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = 64, - step: int = 8, - margin: int = 16, - audio_queries: Optional[Union[List[bytes], List[str]]] = None, - target_sr: Optional[int] = None, - ): - features = get_features_infer( - queries=queries, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - step=step, - margin=margin, - audio_queries=audio_queries, - target_sr=target_sr, - ) - self.all_input_ids: List[List[int]] = features[0] - self.all_segment_ids: List[List[int]] = features[1] - self.all_input_mask: List[List[int]] = features[2] - self.all_subtokens_mask: List[List[int]] = features[3] - self.all_quantities_of_preceding_words: List[int] = features[4] - self.all_query_ids: List[int] = features[5] - self.all_is_first: List[bool] = features[6] - self.all_is_last: List[bool] = features[7] - self.all_audio_queries: Optional[List[List[float]]] = features[8] - self.all_audio_lengths: Optional[List[List[int]]] = features[9] - self.use_audio = audio_queries is not None - - def __len__(self) -> int: - return len(self.all_input_ids) - - def collate_fn( - self, - batch: List[ - Tuple[ - np.ndarray, - np.ndarray, - np.ndarray, - np.ndarray, - int, - int, - bool, - bool, - Optional[np.ndarray], - Optional[np.ndarray], - ] - ], - ) -> Union[ - Tuple[Tensor, Tensor, Tensor, Tensor, Any, Any, Any, Any], - Tuple[Tensor, Tensor, Tensor, Tensor, Any, Any, Any, Any, Any, Any], - ]: - """ - Collates samples into batches. - - Args: - batch (:obj:`List[tuple]`): a list of samples returned by :meth:`__getitem__` method. - - Returns: - :obj:`Tuple[torch.Tensor (x4), Tuple[int, ...] (x2), Tuple[bool, ...] (x2)]`: a tuple containing 8 - elements: - - - ``input_ids`` (:obj:`torch.Tensor`): an integer tensor of shape ``[Batch, Time]`` containing encoded - input text. - - ``segment_ids`` (:obj:`torch.Tensor`): an integer tensor of shape ``[Batch, Time]`` filled with zeros. - - ``input_mask`` (:obj:`torch.Tensor`): a boolean tensor of shape ``[Batch, Time]`` which elements are - ``True`` if corresponding token is not a padding token. 
- - ``subtokens_mask`` (:obj:`torch.Tensor`): a boolean tensor of shape ``[Batch, Time]`` which elements - are ``True`` if corresponding tken is the first token in a word. - - ``quantities_of_preceding_words`` (:obj:`Tuple[int, ...]`): a tuple containing number of words in - a query preceding current segment. - - ``query_ids`` (:obj:`Tuple[int, ...]`): a tuple containing indices of queries to which segments belong. - - ``is_first`` (:obj:`Tuple[bool, ...]`): a tuple booleans which elements are ``True`` if corresponding - segment is the first segment in a query. - - ``is_last`` (:obj:`Tuple[bool, ...]`): a tuple of booleans which elements are ``True`` if corresponding - segment is the last segment in a query. - - """ - if not self.use_audio: - inp_ids, segment_ids, inp_mask, st_mask, n_preceding, query_ids, is_first, is_last = zip(*batch) - return ( - pad_sequence([torch.tensor(x) for x in inp_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in segment_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in inp_mask], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in st_mask], batch_first=True, padding_value=0), - n_preceding, - query_ids, - is_first, - is_last, - ) - ( - inp_ids, - segment_ids, - inp_mask, - st_mask, - n_preceding, - query_ids, - is_first, - is_last, - features, - features_length, - ) = zip(*batch) - return ( - pad_sequence([torch.tensor(x) for x in inp_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in segment_ids], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in inp_mask], batch_first=True, padding_value=0), - pad_sequence([torch.tensor(x) for x in st_mask], batch_first=True, padding_value=0), - n_preceding, - query_ids, - is_first, - is_last, - pad_sequence([torch.tensor(x) for x in features], batch_first=True, padding_value=0).float(), - torch.tensor(features_length, dtype=torch.long), - ) - - def __getitem__(self, idx: int) -> Union[ - Tuple[ndarray, ndarray, ndarray, ndarray, int, int, bool, bool], - Tuple[ndarray, ndarray, ndarray, ndarray, int, int, bool, bool, ndarray, List[int]], - ]: - """ - Returns batch used for punctuation and capitalization inference. - - Args: - idx (:obj:`int`): a batch index - - Returns: - :obj:`Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, bool, bool]`: a tuple containing: - - - ``input_ids`` (:obj:`np.ndarray`): an integer numpy array of shape ``[Time]``. Ids of word - subtokens encoded using tokenizer passed in constructor ``tokenizer`` parameter. - - ``segment_ids`` (:obj:`np.ndarray`): an integer zeros numpy array of shape ``[Time]``. Indices - of segments for BERT model (token types in HuggingFace terminology). - - ``input_mask`` (:obj:`np.ndarray`): a boolean numpy array of shape ``[Time]``. An element of - this array is ``True`` if corresponding token is not padding token. - - ``subtokens_mask`` (:obj:`np.ndarray`): a boolean numpy array of shape ``[Time]``. An element - equals ``True`` if corresponding token is the first token in a word and ``False`` otherwise. For - example, if input query ``"language processing"`` is tokenized into - ``["[CLS]", "language", "process", "ing", "SEP"]``, then ``subtokens_mask`` will be - ``[False, True, True, False, False]``. - - ``quantities_of_preceding_words`` (:obj:`int`): a number of words preceding current segment in the - query to which the segment belongs. This parameter is used for uniting predictions from adjacent - segments. 
- - ``query_ids`` (:obj:`int`): an index of the query to which the segment belongs - - ``is_first`` (:obj:`bool`): whether a segment is the first segment in a query. The left margin of - the first segment in a query is not removed. - - ``is_last`` (:obj:`bool`): whether a segment is the last segment in a query. The right margin of the last - segment in a query is not removed. - """ - if not self.use_audio: - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.float32), - np.array(self.all_subtokens_mask[idx]), - self.all_quantities_of_preceding_words[idx], - self.all_query_ids[idx], - self.all_is_first[idx], - self.all_is_last[idx], - ) - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.float32), - np.array(self.all_subtokens_mask[idx]), - self.all_quantities_of_preceding_words[idx], - self.all_query_ids[idx], - self.all_is_first[idx], - self.all_is_last[idx], - np.array(self.all_audio_queries[idx], dtype=np.float64), - self.all_audio_lengths[idx], - ) diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py deleted file mode 100644 index e88d87ba7c45..000000000000 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py +++ /dev/null @@ -1,1293 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -#     http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
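
Editor's note: the inference dataset removed in the file above is driven through its own ``collate_fn``, since ``__getitem__`` returns per-segment tuples rather than tensors. Below is a minimal, hypothetical sketch (not part of any deleted file) of how it might be wired to a PyTorch ``DataLoader``; the tokenizer name, the example queries, and the batch size are placeholders, and the text-only (no audio) code path is assumed.

from torch.utils.data import DataLoader

from nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset import (
    BertPunctuationCapitalizationInferDataset,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer

# Placeholder queries; long queries are split into overlapping segments internally.
queries = ["how are you", "great to see you today"]
# Assumption: any tokenizer exposing cls_id, sep_id, unk_id, and pad_id works here.
tokenizer = get_tokenizer("bert-base-uncased")

dataset = BertPunctuationCapitalizationInferDataset(
    queries=queries, tokenizer=tokenizer, max_seq_length=64, step=8, margin=16
)
# batch_size counts segments, not queries; collate_fn pads segments to a common length.
loader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collate_fn)

for input_ids, segment_ids, input_mask, subtokens_mask, n_preceding, query_ids, is_first, is_last in loader:
    # query_ids, is_first, and is_last are what allow per-segment predictions
    # to be stitched back into per-query predictions after the model runs.
    print(input_ids.shape, query_ids, is_first, is_last)
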
- -import inspect -import json -import multiprocessing as mp -import os -import pickle -import re -import shutil -import tempfile -from collections import deque -from pathlib import Path -from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Type, Union - -import numpy as np -import torch -import webdataset as wds -from joblib import Parallel, delayed -from omegaconf import DictConfig -from torch.utils.data import IterableDataset - -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset import ( - LABEL_ID_DIR_FOR_NEMO_CHECKPOINT, - BertPunctuationCapitalizationDataset, - Progress, - create_label_ids, - create_masks_and_segment_ids, - load_label_ids, - raise_not_equal_labels_error, -) -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.neural_types import AudioSignal, ChannelType, LabelsType, LengthsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.distributed import webdataset_split_by_workers - -NUMBER_RE = "(0|[1-9][0-9]*)" -TAR_FRAGMENT_TMPL_IN_PROGRESS = "fragment{fragment_idx}.{file_idx}.tar" -TAR_FRAGMENT_TMPL_FINISHED = "fragment{fragment_idx}.num_batches{num_batches}.{file_idx}.tar" -TAR_FRAGMENT_TMPL_TO_REPACK = "fragment{fragment_idx}.num_batches{num_batches}.{file_idx}.tar.to_repack" -TAR_FRAGMENT_PATTERN_IN_PROGRESS = re.compile(f"fragment{NUMBER_RE}.{NUMBER_RE}.tar$") -TAR_FRAGMENT_PATTERN_FINISHED = re.compile(f"fragment{NUMBER_RE}.num_batches{NUMBER_RE}.{NUMBER_RE}.tar$") -TAR_FRAGMENT_PATTERN_TO_REPACK = re.compile(f"fragment{NUMBER_RE}.num_batches{NUMBER_RE}.{NUMBER_RE}.tar.to_repack$") -NOT_ALLOWED_CHARACTERS_IN_FILE_NAME = re.compile(f"[^a-zA-Z0-9_.-]") -REPLACE_NOT_ALLOWED_CHARACTERS_IN_FILE_NAME = re.compile(f"-*[^a-zA-Z0-9_.-]+-*") - -DATASET_PARAMETERS_TMPL = "{prefix}.tokens{tokens_in_batch}.max_seq_length{max_seq_length}.{tokenizer}" -TAR_FINAL_TMPL = ".batches{num_batches}.{ctr}.tar" - -PROGRESS_REPORT_PERIOD = 10 ** 4 - -METADATA_PUNCT_LABEL_VOCAB_KEY = 'punct_label_vocab_file' -METADATA_CAPIT_LABEL_VOCAB_KEY = 'capit_label_vocab_file' -DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME = 'punct_label_vocab.csv' -DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME = 'capit_label_vocab.csv' - - -def count_lines_and_get_fragment_starting_positions( - file_name: Path, lines_per_dataset_fragment: int -) -> Tuple[int, List[int]]: - """ - Returns number of lines in a file and indices of fragment starting bytes. - - Args: - file_name: a path to a text or label file - lines_per_dataset_fragment: number of lines in a dataset fragment. The last fragment can contain fewer lines - - Returns: - num_lines: number of lines in a file - start_bytes: indices of fragment starting bytes - """ - pos = [0] - with file_name.open() as f: - i = 0 - line = f.readline() - while line: - i += 1 - if i % lines_per_dataset_fragment == 0: - pos.append(f.tell()) - line = f.readline() - return i, pos[:-1] if i % lines_per_dataset_fragment == 0 else pos - - -def get_fragment_start_bytes( - text_file: Path, labels_file: Path, lines_per_dataset_fragment: int, audio_file: Path = None -) -> Union[Tuple[Any, Any, Any, Any], Tuple[Any, Any, Any]]: - """ - A function for calculating borders of dataset fragments. The function is used to split ``text_file`` and - ``labels_file`` for processing them in parallel. 
- - Args: - text_file: a path to a dataset source file - labels_file: a path to a dataset label file - lines_per_dataset_fragment: a number of lines in one fragment - audio_file: a path to a dataset audio file if one needed - - Returns: - num_lines: total number of elements in the dataset (number of lines in ``text_file``` and ``labels_file``) - text_start_bytes: indices of the first bytes of fragments in ``text_file`` - label_start_bytes: indices of the first bytes of fragments in ``labels_file`` - """ - logging.info( - f"Counting lines in files {text_file} and {labels_file} and creating segment borders. This may take " - f"considerable time. 86GB, 1.27b lines file was processed in 7 minutes." - ) - if audio_file: - result = Parallel(n_jobs=3)( - delayed(count_lines_and_get_fragment_starting_positions)(file_name, lines_per_dataset_fragment) - for file_name in [text_file, labels_file, audio_file] - ) - num_lines = result[0][0] - if result[0][0] != result[1][0]: - raise ValueError( - f"Text file {text_file} and label file {labels_file} contain different number of lines. Number of lines " - f"in text file: {result[0][0]}, number of lines in label file: {result[1][0]}." - ) - text_start_bytes, label_start_bytes, manifest_start_bytes = result[0][1], result[1][1], result[2][1] - assert len(text_start_bytes) == len(label_start_bytes) == len(manifest_start_bytes) - return num_lines, text_start_bytes, label_start_bytes, manifest_start_bytes - else: - result = Parallel(n_jobs=2)( - delayed(count_lines_and_get_fragment_starting_positions)(file_name, lines_per_dataset_fragment) - for file_name in [text_file, labels_file] - ) - num_lines = result[0][0] - if result[0][0] != result[1][0]: - raise ValueError( - f"Text file {text_file} and label file {labels_file} contain different number of lines. Number of lines " - f"in text file: {result[0][0]}, number of lines in label file: {result[1][0]}." 
- ) - text_start_bytes, label_start_bytes = result[0][1], result[1][1] - assert len(text_start_bytes) == len(label_start_bytes) - return num_lines, text_start_bytes, label_start_bytes - - -def process_fragment( - text_file: Path, - labels_file: Path, - output_dir: Path, - text_start_pos: int, - label_start_pos: int, - lines_per_dataset_fragment: int, - max_seq_length: int, - tokens_in_batch: int, - num_batches_per_tarfile: int, - tokenizer_name: str, - tokenizer_model: Optional[Path], - vocab_file: Optional[Path], - merges_file: Optional[Path], - special_tokens: Dict[str, str], - use_fast_tokenizer: Optional[bool], - pad_label: str, - punct_label_ids: Dict[str, int], - capit_label_ids: Dict[str, int], - fragment_idx: int, - tokenization_progress_queue: mp.Queue, - batch_mark_up_progress_queue: mp.Queue, - batch_building_progress_queue: mp.Queue, - writing_to_tar_progress_queue: mp.Queue, - audio_file: Path = None, - sample_rate: int = None, - audio_file_start_pos: int = None, - use_audio: bool = False, -) -> None: - tokenizer = get_tokenizer( - tokenizer_name, - tokenizer_model=None if tokenizer_model is None else str(tokenizer_model), - vocab_file=None if vocab_file is None else str(vocab_file), - merges_file=None if merges_file is None else str(merges_file), - special_tokens=special_tokens, - use_fast=use_fast_tokenizer, - ) - tmp_text: Optional[str] = None - tmp_labels: Optional[str] = None - tmp_audio: Optional[str] = None - try: - otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True) - olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True) - if use_audio: - oafd, tmp_audio = tempfile.mkstemp( - suffix='.txt', prefix=f'audio_{fragment_idx}_', dir=output_dir, text=True - ) - with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen( - olfd, 'w' - ) as olf: # handle audio manifest - if use_audio: - mf = audio_file.open() - mf.seek(audio_file_start_pos) - oaf = os.fdopen(oafd, 'w') - tf.seek(text_start_pos) - lf.seek(label_start_pos) - for _ in range(lines_per_dataset_fragment): - text_line = tf.readline() - if not text_line: - break - otf.write(text_line) - olf.write(lf.readline()) - if use_audio: - oaf.write(mf.readline()) - if use_audio: - mf.close() - oaf.close() - dataset = BertPunctuationCapitalizationDataset( - tmp_text, - tmp_labels, - max_seq_length, - tokenizer, - tokens_in_batch=tokens_in_batch, - pad_label=pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - n_jobs=0, - use_cache=False, - add_masks_and_segment_ids_to_batch=False, - verbose=False, - tokenization_progress_queue=tokenization_progress_queue, - batch_mark_up_progress_queue=batch_mark_up_progress_queue, - batch_building_progress_queue=batch_building_progress_queue, - audio_file=tmp_audio, - sample_rate=sample_rate, - use_audio=use_audio, - use_bucketing=True, - preload_audios=use_audio, - ) - finally: - if tmp_text is not None and os.path.exists(tmp_text): - os.remove(tmp_text) - if tmp_labels is not None and os.path.exists(tmp_labels): - os.remove(tmp_labels) - if tmp_audio is not None and os.path.exists(tmp_audio): - os.remove(tmp_audio) - dataset.features_pkl.unlink() - tar_ctr = 0 - current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format(fragment_idx=fragment_idx, file_idx=tar_ctr) - current_num_batches = 0 - sink = wds.TarWriter(str(current_file_name)) - progress_made = 0 - for batch_i, batch in enumerate(dataset): - 
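-        # Each batch is pickled into the current tar under the key 'batch.pyd'. Once
-        # `num_batches_per_tarfile` batches have been written, the tar is renamed to its
-        # "finished" name and a new in-progress tar is started. Leftover batches end up in a
-        # '.to_repack' tar that is consolidated later by repack_tar_files_with_not_enough_batches().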
sink.write({"__key__": f"fragment-{fragment_idx}-batch-{batch_i}", "batch.pyd": batch}) - current_num_batches += 1 - progress_made += len(batch['input_ids']) - if current_num_batches % num_batches_per_tarfile == 0: - sink.close() - current_file_name.rename( - output_dir - / TAR_FRAGMENT_TMPL_FINISHED.format( - fragment_idx=fragment_idx, num_batches=current_num_batches, file_idx=tar_ctr - ) - ) - writing_to_tar_progress_queue.put(progress_made) - progress_made = 0 - tar_ctr += 1 - current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format( - fragment_idx=fragment_idx, file_idx=tar_ctr - ) - current_num_batches = 0 - sink = wds.TarWriter(str(current_file_name)) - sink.close() - writing_to_tar_progress_queue.put(progress_made) - if progress_made > 0: - new_file_name = output_dir / TAR_FRAGMENT_TMPL_TO_REPACK.format( - fragment_idx=fragment_idx, num_batches=current_num_batches, file_idx=tar_ctr - ) - current_file_name.rename(new_file_name) - else: - current_file_name.unlink() - if fragment_idx == 0: - punct_label_ids_file, capit_label_ids_file = dataset.save_labels_and_get_file_paths( - DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME, DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - ) - punct_label_ids_file.rename(output_dir / DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME) - capit_label_ids_file.rename(output_dir / DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME) - shutil.rmtree(punct_label_ids_file.parent) - - -def remove_unexpected_files_and_dirs(output_dir: Path, output_file_tmpl: str, metadata_file_name: Path) -> None: - """ - This function removes all files with names which may be used in the dataset creation. - - Args: - output_dir: a path to directory where removal is performed - output_file_tmpl: a format string for a name of final tar file. Must include fields ``ctr`` for number of the - file and ``num_batches`` for number of batches in the file. - metadata_file_name: a metadata file name - """ - if not output_dir.is_dir(): - return - tar_final_pattern = re.compile(output_file_tmpl.format(ctr=NUMBER_RE, num_batches=NUMBER_RE)) - unexpected_tar_files = [ - path - for path in output_dir.iterdir() - if any( - [ - p.match(path.name) is not None - for p in [ - TAR_FRAGMENT_PATTERN_IN_PROGRESS, - TAR_FRAGMENT_PATTERN_FINISHED, - TAR_FRAGMENT_PATTERN_TO_REPACK, - tar_final_pattern, - ] - ] - ) - ] - if unexpected_tar_files: - logging.warning( - f"Found {len(unexpected_tar_files)} unexpected tar files in the output directory {output_dir}. " - f"All of them are going to be removed. The files match one of 3 patterns: " - f"'{TAR_FRAGMENT_PATTERN_IN_PROGRESS.pattern}', '{TAR_FRAGMENT_PATTERN_FINISHED.pattern}', " - f"'{tar_final_pattern.pattern}'. The first unexpected files: " - f"{', '.join([str(f) for f in unexpected_tar_files[:3]])}." - ) - for fn in unexpected_tar_files: - fn.unlink() - if metadata_file_name.exists(): - logging.warning(f"Found metadata file {metadata_file_name}. It is going to be removed.") - metadata_file_name.unlink() - punct_label_ids = output_dir / DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME - capit_label_ids = output_dir / DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - if punct_label_ids.exists(): - logging.warning(f"Found unexpected punctuation label file {punct_label_ids}. It is going to be removed.") - punct_label_ids.unlink() - if capit_label_ids.exists(): - logging.warning(f"Found unexpected capitalization label file {capit_label_ids}. 
It is going to be removed.") - capit_label_ids.unlink() - - -def collect_unique_labels_from_fragment( - labels_file: Path, start_pos: int, lines_per_dataset_fragment: int, progress_queue: mp.Queue, fragment_idx: int -) -> Tuple[Set[str], Set[str]]: - """ - Returns a set of unique punctuation labels and a set of unique capitalization labels. - - Args: - labels_file: a path to a file with labels - start_pos: an index of the first byte of a fragment in ``labels_file`` - lines_per_dataset_fragment: number of lines in dataset fragment. In the last fragment there can be less lines. - progress_queue: a queue for reporting number of processed lines - fragment_idx: a processed fragment index - - Returns: - unique_punct: a set of unique punctuation labels - unique_capit: a set of unique capitalization labels - """ - unique_punct, unique_capit = set(), set() - with labels_file.open() as f: - f.seek(start_pos) - progress_report = 0 - for i in range(lines_per_dataset_fragment): - line = f.readline() - if not line: - break - pairs = line.split() - if not all([len(p) == 2 for p in pairs]): - broken_pairs = [i for i, p in enumerate(pairs) if len(p) != 2] - raise ValueError( - f"Found broken labels line in number {fragment_idx * lines_per_dataset_fragment + i} in file " - f"{labels_file}. Indices of broken pairs of labels: {broken_pairs}" - ) - punct, capit = zip(*pairs) - unique_punct.update(punct) - unique_capit.update(capit) - progress_report += 1 - if progress_report >= PROGRESS_REPORT_PERIOD: - progress_queue.put(progress_report) - progress_report = 0 - progress_queue.put(progress_report) - return unique_punct, unique_capit - - -def create_label_dictionaries( - labels_file: Path, - text_start_bytes: List[int], - num_lines: int, - lines_per_dataset_fragment: int, - pad_label: str, - n_jobs: int, -) -> Tuple[Dict[str, int], Dict[str, int]]: - """ - Creates punctuation and capitalization label ids dictionaries based on labels present in ``labels_file``. - - Args: - labels_file: a path to file with labels - text_start_bytes: indices of first bytes of fragments in ``labels_file`` - num_lines: total number of lines in ``labels_file`` - lines_per_dataset_fragment: number of lines in dataset fragments. The last fragment can have fewer lines - pad_label: a label used for padding and for absence of punctuation and capitalization - n_jobs: a number of fragments processed in parallel - - Returns: - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - with Progress(num_lines, "Creating label dictionary", "line") as progress_queues: - result = Parallel(n_jobs=min(n_jobs, len(text_start_bytes)))( - delayed(collect_unique_labels_from_fragment)( - labels_file, start_pos, lines_per_dataset_fragment, *progress_queues, fragment_idx - ) - for fragment_idx, start_pos in enumerate(text_start_bytes) - ) - unique_punct, unique_capit = zip(*result) - unique_punct = set().union(*unique_punct) - unique_capit = set().union(*unique_capit) - return create_label_ids(unique_punct, pad_label), create_label_ids(unique_capit, pad_label) - - -def check_label_ids(pad_label: str, punct_label_ids: Dict[str, int], capit_label_ids: Dict[str, int]) -> None: - """ - A function for checking that pad label has zeroth id in ``punct_label_dis`` and ``capit_label_ids`` dictionaries. 
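-
-    For example (label sets are illustrative)::
-
-        check_label_ids('O', {'O': 0, ',': 1, '.': 2}, {'O': 0, 'U': 1})  # passes
-        check_label_ids('O', {',': 0, 'O': 1}, None)  # raises ValueError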
- Args: - pad_label: a pad label - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - msg = "Parameter `pad_label` has to have id 0 in dictionary `{param_name}` whereas it has id {id_}." + ( - '' if len(pad_label) > 10 else f" pad_label='{pad_label}'" - ) - if punct_label_ids is not None: - if punct_label_ids[pad_label] != 0: - raise ValueError(msg.format(param_name='punct_label_ids', id_=punct_label_ids[pad_label])) - if capit_label_ids is not None: - if capit_label_ids[pad_label] != 0: - raise ValueError(msg.format(param_name='capit_label_ids', id_=capit_label_ids[pad_label])) - - -def process_error(msg: str, error_class_or_function: Union[Type[Exception], Callable[[str], Any]]) -> None: - if inspect.isclass(error_class_or_function) and issubclass(error_class_or_function, Exception): - raise error_class_or_function(msg) - if callable(error_class_or_function): - error_class_or_function(msg) - raise ValueError( - f"Parameter `error_class_or_function` has to be a subclass of `Exception` or a function." - f"Given {type(error_class_or_function)}" - ) - - -def check_labels_for_being_unique_before_building_label_ids( - pad_label: str, - other_labels: List[str], - pad_label_name: str, - other_labels_name: str, - error_class_or_function: Union[Type[Exception], Callable[[str], Any]], -) -> None: - """ - A function for checking that that all labels are unique. - - Args: - pad_label: a pad label - other_labels: a list of labels except for the pad label - pad_label_name: a name of the pad label used in error message - other_labels_name: a name of other labels used in error message - error_class_or_function: a class of an exception which is raised if there is a problem with labels. - Alternatively it can be a function for handling exceptions, for example ``argparse.ArgumentParser.error``. - Such a function has to take one argument -- error message. - """ - for i, lbl in enumerate(other_labels): - if lbl == pad_label: - msg = f"Label number {i} in parameter `{other_labels_name}` is equal to `{pad_label_name}`." - process_error(msg, error_class_or_function) - for i in range(len(other_labels) - 1): - for lbl in other_labels[i + 1 :]: - if lbl == other_labels[i]: - msg = f"Label number {i} occurs at least 2 times in parameter `{other_labels_name}`." - process_error(msg, error_class_or_function) - - -def build_label_ids_from_list_of_labels(pad_label: str, other_labels: List[str]) -> Dict[str, int]: - """ - Builds label ids dictionary from pad label and list of other labels. Used for parsing command line arguments. - Args: - pad_label: a pad label - other_labels: list of labels except for the pad label - - Returns: - a dictionary with label ids - """ - check_labels_for_being_unique_before_building_label_ids( - pad_label, other_labels, 'pad_label', 'other_labels', ValueError - ) - ids = {pad_label: 0} - for lbl in other_labels: - ids[lbl] = len(ids) - return ids - - -def get_label_dictionaries( - labels_file: Path, - start_bytes: List[int], - num_lines: int, - lines_per_dataset_fragment: int, - pad_label: str, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - punct_label_vocab_file: Optional[Path], - capit_label_vocab_file: Optional[Path], - n_jobs: int, -) -> Tuple[Dict[str, int], Dict[str, int]]: - """ - Return label ids if the label ids are present in parameters ``punct_label_ids``, ``capit_label_ids``, - ``punct_label_vocab_file``, ``capit_label_vocab_file``. 
Otherwise, label ids are created using ``labels_file``. - - Args: - labels_file: a path to file with labels. Labels have to be given in the format described in - https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format - start_bytes: a list of positions in ``labels_file`` at which fragments start. Parameter ``start_bytes`` is used - for creating labels for several fragments in parallel - num_lines: total number of lines in ``labels_file``. Parameter ``num_lines`` is used for showing progress of - label ids collection - lines_per_dataset_fragment: number of lines in a dataset fragment - pad_label: a label used for padding and also neutral label showing there is no punctuation and capitalization. - Label ``pad_label`` has to have id ``0`` in parameters ``punct_label_ids``, ``capit_label_ids``, - ``punct_label_vocab_file``, ``capit_label_vocab_file`` if these parameters are provided. - punct_label_ids: a dictionary with punctuation label ids. Pad label has to have id ``0``. No more than 1 of - parameters ``punct_label_ids`` and ``punct_label_vocab_file`` can be provided. - capit_label_ids: a dictionary with capitalization label ids. Pad label has to have id ``0``. No more than 1 of - parameters ``capit_label_ids`` and ``capit_label_vocab_file`` can be provided. - punct_label_vocab_file: a text file with punctuation labels. Every line in the file contains 1 label. Pad label - has to be in the first line. No more than 1 of parameters ``punct_label_ids`` and - ``punct_label_vocab_file`` can be provided. - capit_label_vocab_file: a text file with capitalization labels. Every line in the file contains 1 label. Pad - label has to be in the first line. No more than 1 of parameters ``capit_label_ids`` and - ``capit_label_vocab_file`` can be provided. - n_jobs: a number of fragments processed in parallel - - Returns: - punct_label_ids: a dictionary with punctuation label ids - capit_label_ids: a dictionary with capitalization label ids - """ - if punct_label_ids is not None and punct_label_vocab_file is not None: - raise ValueError("You can provide at most one of parameters `punct_label_ids` and `punct_label_vocab_file`.") - if capit_label_ids is not None and capit_label_vocab_file is not None: - raise ValueError("You can provide at most one of parameters `capit_label_ids` and `capit_label_vocab_file`.") - if punct_label_ids is None and punct_label_vocab_file is not None: - punct_label_ids = load_label_ids(punct_label_vocab_file) - if capit_label_ids is None and capit_label_vocab_file is not None: - capit_label_ids = load_label_ids(capit_label_vocab_file) - check_label_ids(pad_label, punct_label_ids, capit_label_ids) - if punct_label_ids is None or capit_label_ids is None: - _punct_label_ids, _capit_label_ids = create_label_dictionaries( - labels_file, start_bytes, num_lines, lines_per_dataset_fragment, pad_label, n_jobs - ) - if punct_label_ids is None: - punct_label_ids = _punct_label_ids - if capit_label_ids is None: - capit_label_ids = _capit_label_ids - return punct_label_ids, capit_label_ids - - -def decode_pyd(key: str, value: bytes) -> Any: - """ - Used for decoding batch loaded by ``webdataset`` from tar files. 
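-
-    In this module the function is registered in webdataset pipelines via::
-
-        wds.decode(wds.handle_extension('.pyd', decode_pyd))
-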
- Args: - key: name of a batch - value: pickled batch - - Returns: - decoded batch - """ - return pickle.loads(value) - - -def repack_tar_files_with_not_enough_batches(output_dir: Path, num_batches_per_tarfile: int) -> None: - f""" - It is possible that number of batches in a fragment is not evenly divisible by ``num_batches_per_tarfile``. - In such a case excess batches are put in a tar file which matches a pattern - ``fragment(0|[1-9][0-9]*).num_batches(0|[1-9][0-9]*).(0|[1-9][0-9]*).tar.to_repack``. Such files are repacked by - ``repack_tar_files_with_not_enough_batches`` function into tar files with correct ``num_batches_per_tarfile`` - batches each. If there is no enough batches in repacked files, then up to ``num_batches_per_tarfile - 1`` - remaining batches may be discarded. - - Args: - output_dir: a path to the output directory which contains files to repack and where new files are saved - num_batches_per_tarfile: a number of batches in 1 tar file. If number of batches in files matching a pattern - ``fragment(0|[1-9][0-9]*).num_batches(0|[1-9][0-9]*).(0|[1-9][0-9]*).tar.to_repack`` is not evenly - divisible by ``num_batches_per_tarfile`` excess batches are discarded. - """ - files_to_repack_with_matches = [ - (path, TAR_FRAGMENT_PATTERN_TO_REPACK.match(path.name)) - for path in output_dir.iterdir() - if TAR_FRAGMENT_PATTERN_TO_REPACK.match(path.name) is not None - ] - files_to_repack_with_matches = sorted(files_to_repack_with_matches, key=lambda x: int(x[1].group(3))) - logging.info(f"Found {len(files_to_repack_with_matches)} files for repacking.") - files_to_repack_with_matches = deque(files_to_repack_with_matches) - total_batches_in_repacked_files = 0 - initial_number_of_files_to_repack = len(files_to_repack_with_matches) - pop_file_ds = None - new_file_sink = None - new_file_num_batches = 0 - while files_to_repack_with_matches: - assert pop_file_ds is None or new_file_sink is None - if new_file_sink is None: - # `append_file` is a file which content will serve as a start for new tar file. `append_file` content is - # copied into a `new_file` and then content of other files needing repacking is appended to content of - # `new_file`. - append_file, match = files_to_repack_with_matches.popleft() - new_file = append_file.parent / TAR_FRAGMENT_TMPL_FINISHED.format( - fragment_idx=match.group(1), num_batches=num_batches_per_tarfile, file_idx=match.group(3) - ) - new_file_sink = wds.TarWriter(str(new_file)) - append_ds_to_rewrite = wds.DataPipeline( - wds.SimpleShardList(urls=[str(append_file)]), - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.to_tuple('__key__', 'batch.pyd'), - ) - for key, batch in iter(append_ds_to_rewrite): - new_file_sink.write({"__key__": key, "batch.pyd": batch}) - new_file_num_batches += 1 - total_batches_in_repacked_files += 1 - assert total_batches_in_repacked_files < initial_number_of_files_to_repack * num_batches_per_tarfile - assert new_file_num_batches == int(match.group(2)), ( - f"Number of batches {new_file_num_batches} in {append_file} is different from number of batches " - f"{match.group(2)} in repacked tar file with name {append_file}." 
- ) - append_file.unlink() - if files_to_repack_with_matches and pop_file_ds is None: - pop_file, _ = files_to_repack_with_matches.pop() - pop_file_ds = wds.DataPipeline( - wds.SimpleShardList([str(pop_file)]), - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.to_tuple('__key__', 'batch.pyd'), - ) - pop_file_ds = iter(pop_file_ds) - if pop_file_ds is not None and new_file_sink is not None: - while new_file_num_batches < num_batches_per_tarfile: - try: - key, batch = next(pop_file_ds) - except StopIteration: - pop_file_ds = None - pop_file.unlink() - break - new_file_sink.write({"__key__": key, "batch.pyd": batch}) - total_batches_in_repacked_files += 1 - assert total_batches_in_repacked_files < initial_number_of_files_to_repack * num_batches_per_tarfile - new_file_num_batches += 1 - if new_file_num_batches >= num_batches_per_tarfile: - assert new_file_num_batches == num_batches_per_tarfile - new_file_sink.close() - new_file_sink = None - new_file_num_batches = 0 - if new_file_sink is not None: - new_file_sink.close() - new_file.unlink() - logging.info(f"Discarded {new_file_num_batches} batches.") - if pop_file_ds is not None: - pop_file.unlink() - logging.info(f"Repacked {total_batches_in_repacked_files} batches from short tar files") - - -def create_metadata_file( - output_dir: Path, output_file_tmpl: str, metadata_file_name: Path, num_batches_per_tarfile: int -) -> None: - """ - Rename tar files according to template ``output_file_tmpl`` and save metadata file. - Args: - output_dir: a path to directory which contains initial tar files and where renamed tar files are saved - output_file_tmpl: a template of a new tar file name - metadata_file_name: a path to a file into which metadata is going to be saved - num_batches_per_tarfile: a required number of batches in tar files. Used for checking that present tar files - have correct number of batches - """ - metadata = {"num_batches": 0, "tar_files": []} - for i, fn in enumerate([fn for fn in output_dir.iterdir() if TAR_FRAGMENT_PATTERN_FINISHED.match(fn.name)]): - nb = int(TAR_FRAGMENT_PATTERN_FINISHED.match(fn.name).group(2)) - assert nb == num_batches_per_tarfile - new_name = output_dir / output_file_tmpl.format(ctr=i, num_batches=nb) - fn.rename(new_name) - metadata['tar_files'].append(new_name.name) - metadata["num_batches"] += nb - metadata[METADATA_PUNCT_LABEL_VOCAB_KEY] = DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME - metadata[METADATA_CAPIT_LABEL_VOCAB_KEY] = DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME - logging.info(f"{metadata['num_batches']} batches are in tarred dataset with metadata file {metadata_file_name}") - with metadata_file_name.open('w') as f: - json.dump(metadata, f, indent=2) - - -def check_tar_file_prefix( - tar_file_prefix: str, error_class_or_function: Union[Type[Exception], Callable[[str], Any]], var_name: str -) -> None: - not_allowed_characters_in_prefix = NOT_ALLOWED_CHARACTERS_IN_FILE_NAME.findall(tar_file_prefix) - if not_allowed_characters_in_prefix: - not_allowed_characters_in_prefix = set(not_allowed_characters_in_prefix) - msg = ( - f"Found {len(not_allowed_characters_in_prefix)} not allowed characters in `{var_name}`. Only 'A-Z', " - f"'a-z', '0-9', '_', '-', '.' characters are allowed. Examples of not allowed characters: " - f"{list(not_allowed_characters_in_prefix)[:10]}. `{var_name}`[:30]={repr(tar_file_prefix)[:30]}." 
- ) - process_error(msg, error_class_or_function) - - -def create_tarred_dataset( - text_file: Union[os.PathLike, str], - labels_file: Union[os.PathLike, str], - output_dir: Union[os.PathLike, str], - max_seq_length: int, - tokens_in_batch: int, - lines_per_dataset_fragment: int, - num_batches_per_tarfile: int, - tokenizer_name: str, - tokenizer_model: Optional[Union[os.PathLike, str]] = None, - vocab_file: Optional[Union[os.PathLike, str]] = None, - merges_file: Optional[Union[os.PathLike, str]] = None, - special_tokens: Optional[Dict[str, str]] = None, - use_fast_tokenizer: Optional[bool] = False, - pad_label: str = 'O', - punct_label_ids: Optional[Dict[str, int]] = None, - capit_label_ids: Optional[Dict[str, int]] = None, - punct_label_vocab_file: Optional[Union[os.PathLike, str]] = None, - capit_label_vocab_file: Optional[Union[os.PathLike, str]] = None, - tar_file_prefix: Optional[str] = 'punctuation_capitalization', - n_jobs: Optional[int] = None, - audio_file: Optional[Path] = None, - use_audio: Optional[bool] = False, - sample_rate: Optional[int] = 16000, -) -> None: - """ - Creates tarred dataset from ``text_file`` and ``labels_file``. A tarred dataset allows to train on large amounts of - data without storing it all into memory simultaneously. You may use these function directly or try script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_. - - Tarred dataset is a directory which contains metadata file, tar files with batches, - ``punct_label_vocab.csv`` and ``capit_label_vocab.csv`` files. - - Metadata file is a JSON file with 4 items: ``'num_batches'``, ``'tar_files'``, ``'punct_label_vocab_file'``, - ``'capit_label_vocab_file'``. The item ``'num_batches'`` (``int``) is a total number of batches in tarred dataset. - ``'tar_files'`` is a list of paths to tar files relative to directory containing the metadata file. The items - ``'punct_label_vocab_file'`` and ``'capit_label_vocab_file'`` are correspondingly paths to punctuation and - capitalization label vocabulary files. These paths are relative to directory containing the metadata file. - - Every tar file contains objects written using ``webdataset.TarWriter``. Each object is a dictionary with two items: - ``'__key__'`` and ``'batch.pyd'``. ``'__key__'`` is a name of a batch and ``'batch.pyd'`` is a pickled dictionary - which contains ``'input_ids'``, ``'subtokens_mask'``, ``'punct_labels'``, ``'capit_labels'``. ``'input_ids'`` is an - array containing ids of source tokens, ``'subtokens_mask'`` is a boolean array showing first tokens in words, - ``'punct_labels'`` and ``'capit_labels'`` are arrays with ids of labels. - - Metadata file should be passed to constructor of :class:`BertPunctuationCapitalizationTarredDataset` and the - instance of the class will handle iteration and constructing masks and token types for BERT model. - - Args: - text_file (:obj:`Union[os.PathLike, str]`): a path to a file with dataset source. Dataset source is lowercase - text without punctuation. Number of lines in ``text_file`` has to be equal to the number of lines in - ``labels_file``. - labels_file (:obj:`Union[os.PathLike, str]`): a path to a file with labels. Labels are given in the format - described in :ref:`NeMo Data Format`. - output_dir (:obj:`Union[os.PathLike, str]`): a path to a directory where metadata file, tar files and - ``'punct_label_ids.csv'`` and ``'capit_label_ids.csv'`` files are saved. - max_seq_length (:obj:`int`): Maximum number of subtokens in an input sequence. 
A source sequence which contains - too many subtokens is clipped to ``max_seq_length - 2`` subtokens and then [CLS] token is prepended to the - clipped sequence and [SEP] token is appended to the clipped sequence. The clipping is performed via removal - of subtokens in the end of a source sequence. - tokens_in_batch (:obj:`int`): maximum number of tokens in a batch including [CLS], [SEP], [UNK], and [PAD] - tokens. Before packing into batches source sequences are sorted by number of tokens in order to reduce - number of pad tokens. So the number of samples in a batch may vary. - lines_per_dataset_fragment (:obj:`int`): a number of lines processed by one worker during creation of tarred - dataset. A worker tokenizes ``lines_per_dataset_fragment`` lines and keeps in RAM tokenized text labels - before packing them into batches. Reducing ``lines_per_dataset_fragment`` leads to reducing of the amount - of memory used by this function. - num_batches_per_tarfile (:obj:`int`): a number of batches saved in a tar file. If you increase - ``num_batches_per_tarfile``, then there will be less tar files in the dataset. There cannot be less than - ``num_batches_per_tarfile`` batches in a tar file, and all excess batches are removed. Maximum number of - discarded batches is ``num_batches_per_tarfile - 1``. - tokenizer_name (:obj:`str`): a name of the tokenizer used for tokenization of source sequences. Possible - options are ``'sentencepiece'``, ``'word'``, ``'char'``, HuggingFace tokenizers. For more options see - function ``nemo.collections.nlp.modules.common.get_tokenizer``. The tokenizer must have properties - ``cls_id``, ``pad_id``, ``sep_id``, ``unk_id``. - tokenizer_model (:obj:`Union[os.PathLike, str]`, `optional`): a path to a tokenizer model required for - ``'sentencepiece'`` tokenizer. - vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to a vocabulary file which can be used in - ``'word'``, ``'char'``, and HuggingFace tokenizers. - merges_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to merges file which can be used in - HuggingFace tokenizers. - special_tokens (:obj:`Dict[str, str]`, `optional`): a dictionary with special tokens passed to constructors of - ``'char'``, ``'word'``, ``'sentencepiece'``, and various HuggingFace tokenizers. - use_fast_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to use fast HuggingFace - tokenizer. - pad_label (:obj:`str`, `optional`, defaults to :obj:`'O'`): a pad label both for punctuation and - capitalization. This label is also a neutral label (used for marking words which do not need punctuation - and capitalization). - punct_label_ids (:obj:`Dict[str, int]`, `optional`): a dictionary which keys are punctuation labels and values - are label ids. The pad label ``pad_label`` has to have id ``0``. You can provide at most one of parameters - ``punct_label_ids`` and ``punct_label_vocab_file``. If none of parameters ``punct_label_ids`` and - ``punct_label_vocab_file`` is provided, then punctuation label ids will be inferred from ``labels_file`` - file. - capit_label_ids (:obj:`Dict[str, int]`, `optional`): same as ``punct_label_ids`` for capitalization labels. - punct_label_vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): a path to a file with punctuation labels. - These labels include pad label. The pad label has to be the first label in the file. Each label is written - on a separate line. Alternatively you can use ``punct_labels_ids`` parameter. 
If none of parameters - ``punct_labels_ids`` and ``punct_label_vocab_file`` is provided, then punctuation label ids will be - inferred from ``labels_file`` file. - capit_label_vocab_file (:obj:`Union[os.PathLike, str]`, `optional`): same as ``punct_label_vocab_file`` for - capitalization labels. - tar_file_prefix (:obj:`str`, `optional`, defaults :obj:`'punctuation_capitalization'`): a string from which tar - file names start. The string can contain only characters ``A-Z``, ``a-z``, ``0-9``, ``_``, ``-``, ``.``. - n_jobs (:obj:`int`, `optional`): a number of workers for creating tarred dataset. If ``None``, then ``n_jobs`` - is equal to number of CPUs. - audio_file (:obj:`Optional[Union[os.PathLike, str]]`, defaults to :obj:`None`): a path to a file with audio dataset file paths if dataset is lexical and audio. Must contain one path per line. - use_audio (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to ``True`` dataset becomes lexical and audio rather than only lexical. - sample_rate (:obj:`int`, `optional`, defaults to :obj:`16000`) Targeted sample rate of audios If ``use_audio`` set to ``True``. - """ - check_tar_file_prefix(tar_file_prefix, ValueError, 'tar_file_prefix') - if n_jobs is None: - n_jobs = mp.cpu_count() - text_file, labels_file = Path(text_file).expanduser(), Path(labels_file).expanduser() - output_dir = Path(output_dir).expanduser() - ds_params_str = DATASET_PARAMETERS_TMPL.format( - prefix=tar_file_prefix, - tokens_in_batch=tokens_in_batch, - max_seq_length=max_seq_length, - tokenizer=REPLACE_NOT_ALLOWED_CHARACTERS_IN_FILE_NAME.sub('-', tokenizer_name), - ) - output_file_tmpl = ds_params_str + TAR_FINAL_TMPL - metadata_file_name = output_dir / ('metadata.' + ds_params_str + '.json') - remove_unexpected_files_and_dirs(output_dir, output_file_tmpl, metadata_file_name) - audio_start_bytes = None - if use_audio: - num_lines, text_start_bytes, label_start_bytes, audio_start_bytes = get_fragment_start_bytes( - text_file, labels_file, lines_per_dataset_fragment, audio_file - ) - else: - num_lines, text_start_bytes, label_start_bytes = get_fragment_start_bytes( - text_file, labels_file, lines_per_dataset_fragment - ) - if text_start_bytes: - output_dir.mkdir(parents=True, exist_ok=True) - else: - raise ValueError(f"Both {labels_file} and {text_file} are empty. 
Tarred dataset cannot be created.") - punct_label_ids, capit_label_ids = get_label_dictionaries( - labels_file, - label_start_bytes, - num_lines, - lines_per_dataset_fragment, - pad_label, - punct_label_ids, - capit_label_ids, - punct_label_vocab_file, - capit_label_vocab_file, - n_jobs, - ) - - with Progress( - num_lines, ["Tokenization", "Batch mark up", "Batch building", "Writing tarred dataset"], "query" - ) as progress_queues: - Parallel(n_jobs=min(n_jobs, len(text_start_bytes)))( - delayed(process_fragment)( - text_file, - labels_file, - output_dir, - text_start_pos, - label_start_pos, - lines_per_dataset_fragment, - max_seq_length, - tokens_in_batch, - num_batches_per_tarfile, - tokenizer_name, - None if tokenizer_model is None else Path(tokenizer_model).expanduser(), - None if vocab_file is None else Path(vocab_file).expanduser(), - None if merges_file is None else Path(merges_file).expanduser(), - special_tokens, - use_fast_tokenizer, - pad_label, - punct_label_ids, - capit_label_ids, - fragment_idx, - *progress_queues, - audio_file, - sample_rate, - audio_file_start_pos, - use_audio, - ) - for fragment_idx, (text_start_pos, label_start_pos, audio_file_start_pos) in enumerate( - zip( - text_start_bytes, - label_start_bytes, - audio_start_bytes if use_audio else [None for _ in range(len(text_start_bytes))], - ) - ) - ) - repack_tar_files_with_not_enough_batches(output_dir, num_batches_per_tarfile) - create_metadata_file(output_dir, output_file_tmpl, metadata_file_name, num_batches_per_tarfile) - - -class BertPunctuationCapitalizationTarredDataset(IterableDataset): - """ - Punctuation capitalization dataset which allows not to load all data in memory simultaneously. A tarred dataset - is created from text and label files using script - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_ - or function - :func:`~nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.create_tarred_dataset`. - - Args: - metadata_file (:obj:`Union[os.PathLike, str]`): a path to tarred dataset metadata file. Metadata file and files - referenced in metadata file are created by - `examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py - `_. - Metadata file is a JSON file which contains ``'num_batches'``, ``'tar_files'``, - ``'punct_label_vocab_file'``, ``'capit_label_vocab_file'`` items. The first item is total number of batches - in a dataset, the second is a list of paths to tar files relative to directory containing - ``metadata_file``. Items ``'punct_label_vocab_file'`` and ``'capit_label_vocab_file'`` are paths to - ``.csv`` files which contain unique punctuation a capitalization label vocabularies. Vocabulary file paths - are relative to directory containing the ``metadata_file``. Each line in ``'punct_label_vocab_file'`` and - ``'capit_label_vocab_file'`` contains 1 label. The first lines in ``'punct_label_vocab_file'`` and - ``'capit_label_vocab_file'`` files are neutral labels which also serve as pad labels. Neutral labels for - punctuation and capitalization must be equal to the ``pad_label`` parameter. - tokenizer (:obj:`TokenizerSpec`): a tokenizer instance used for tokenization of dataset source. A tokenizer - instance is used for getting ids of [CLS], [PAD], and [SEP] tokens which are used for masks creation. - pad_label (:obj:`str`): a label that is used for padding and for absence of punctuation or - capitalization. 
Used for checking items ``'punct_label_vocab'`` and ``'capit_label_vocab'`` of dictionary - in ``metadata_file``. - label_info_save_dir (:obj:`Union[os.PathLike, str]`, `optional`): a path to a directory where label - vocabularies are copied when method :meth:`save_labels_and_get_file_paths` is called. This parameter is - useful if tarred dataset directory is read-only. - ignore_extra_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): whether to use only first token in a - word for loss computation and training. If set to ``True``, then loss will be computed only for the first - tokens of words. - ignore_start_end (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to compute loss for [CLS] and - [SEP] tokens. If set to ``True``, then loss will not be computed for [CLS] and [SEP] tokens. - world_size (:obj:`int`, `optional`, defaults to :obj:`1`): a number of processes used for model training. It is - used together with a ``global_rank`` parameter to decide which tar files will be used in the current - process. - global_rank (:obj:`int`, `optional`, defaults to :obj:`0`): a number of current process in the pool of workers - used for model training. It is used together with ``world_size`` parameter to decide which tar files will - be used in the current process. - shuffle_n (:obj:`int`, `optional`, defaults to :obj:`1`): a number of shuffled batches in a buffer. - ``shuffle_n`` batches are loaded into memory, shuffled, and then yielded by a dataset instance. - shard_strategy (:obj:`str`, defaults to :obj:``'scatter'``): Tarred dataset shard distribution strategy chosen as - a str value during ddp. - - ``'scatter'``: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - ``'replicate'``: Optional shard strategy, where each node gets all the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of :param:`shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tar files, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
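-        When the dataset is constructed with ``use_audio=True``, audio ``'features'`` and
-        ``'features_length'`` ports are returned in addition to the lexical ones.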
""" - if self.use_audio: - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - 'features': NeuralType(('B', 'T'), AudioSignal()), - 'features_length': NeuralType(('B', 'T'), LengthsType()), - } - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'punct_labels': NeuralType(('B', 'T'), LabelsType()), - 'capit_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - metadata_file: Union[os.PathLike, str], - tokenizer: TokenizerSpec, - pad_label: str, - label_info_save_dir: Optional[Union[os.PathLike, str]] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = True, - world_size: int = 1, - global_rank: int = 0, - shuffle_n: int = 1, - shard_strategy: str = "scatter", - use_audio: bool = False, - ) -> None: - super().__init__() - - valid_shard_strategies = ['scatter', 'replicate'] - if shard_strategy not in valid_shard_strategies: - raise ValueError( - f"Invalid shard strategy of type {type(shard_strategy)} " - f"{repr(shard_strategy) if len(repr(shard_strategy)) < 100 else repr(shard_strategy)[:100] + '...'}! " - f"Allowed values are: {valid_shard_strategies}." - ) - - self.tokenizer = tokenizer - self.metadata_file = Path(metadata_file).expanduser() - if label_info_save_dir is None: - self.for_nemo_ckpt = self.metadata_file.parent / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - else: - self.for_nemo_ckpt = Path(label_info_save_dir).expanduser() / LABEL_ID_DIR_FOR_NEMO_CHECKPOINT - with open(self.metadata_file) as f: - self.metadata = json.load(f) - self.ignore_extra_tokens = ignore_extra_tokens - self.ignore_start_end = ignore_start_end - self.tar_files = [] - for file_path in self.metadata['tar_files']: - file_path = Path(file_path).expanduser() - if file_path.is_absolute(): - self.tar_files.append(str(file_path)) - else: - self.tar_files.append(str(self.metadata_file.parent / file_path)) - self.punct_label_vocab_file = self.metadata_file.parent / self.metadata[METADATA_PUNCT_LABEL_VOCAB_KEY] - self.capit_label_vocab_file = self.metadata_file.parent / self.metadata[METADATA_CAPIT_LABEL_VOCAB_KEY] - self.punct_label_ids = load_label_ids(self.punct_label_vocab_file) - self.capit_label_ids = load_label_ids(self.capit_label_vocab_file) - self.pad_label = pad_label - self._check_pad_label() - - if shard_strategy == 'scatter': - logging.info("Tarred dataset shards will be scattered evenly across all nodes.") - if len(self.tar_files) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(self.tar_files)}) is not divisible " - f"by number of distributed workers ({world_size}). " - f"Some shards will not be used ({len(self.tar_files) % world_size})." 
- ) - begin_idx = (len(self.tar_files) // world_size) * global_rank - end_idx = begin_idx + (len(self.tar_files) // world_size) - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - batches_per_tar = self.metadata['num_batches'] // len(self.tar_files) - self.tar_files = self.tar_files[begin_idx:end_idx] - self.length = batches_per_tar * len(self.tar_files) * world_size - - elif shard_strategy == 'replicate': - logging.info("All tarred dataset shards will be replicated across all nodes.") - self.length = self.metadata['num_batches'] - - else: - raise ValueError(f"Invalid shard strategy! Allowed values are: {valid_shard_strategies}") - - self._dataset = wds.DataPipeline( - wds.SimpleShardList(self.tar_files), - webdataset_split_by_workers, - wds.tarfile_to_samples(), - wds.decode(wds.handle_extension('.pyd', decode_pyd)), - wds.shuffle(shuffle_n), - wds.to_tuple('__key__', 'batch.pyd'), - wds.map(self._build_sample), - ) - - self.use_audio = use_audio - - def _check_pad_label(self) -> None: - """ - Checks the condition that ``pad_label`` passed to this class constructor has ``0`` id in - ``self.punct_label_ids`` and ``self.capit_label_ids`` loaded from tarred dataset. - """ - for label_ids, labels_file, task in [ - (self.punct_label_ids, self.metadata[METADATA_PUNCT_LABEL_VOCAB_KEY], "punctuation"), - (self.capit_label_ids, self.metadata[METADATA_CAPIT_LABEL_VOCAB_KEY], "capitalization"), - ]: - if label_ids[self.pad_label] != 0: - raise ValueError( - f"Pad label '{self.pad_label}' has non zero id {label_ids[self.pad_label]} in {task} " - f"ids dictionary loaded from {labels_file}." - ) - - def check_for_label_consistency_with_model_config( - self, - punct_label_ids: Optional[Dict[str, int]], - capit_label_ids: Optional[Dict[str, int]], - class_labels: DictConfig, - common_dataset_parameters_config: DictConfig, - ) -> None: - """ - Checks that label ids loaded from tarred dataset are identical to those provided in - ``model.common_dataset_parameters`` :ref:`config` item. In addition, - this method checks that label ids set in attributes ``punct_label_ids`` and ``capit_label_ids`` of an instance - of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - are identical to label ids loaded from tarred dataset. - - Args: - punct_label_ids: a content of ``punct_label_ids`` attribute of an instance of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - in which this tarred dataset is used. - capit_label_ids: a content of ``capit_label_ids`` attribute of an instance of - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` - in which this tarred dataset is used. - class_labels: a config item ``model.class_labels``. See more in description of - :ref:`class labels' config`. - common_dataset_parameters_config: a config item ``model.common_dataset_parameters``. See more in - of :ref:`common dataset parameters config`. 
- """ - tarred_dataset_label_desc_tmpl = ( - f'{{label_type}} labels loaded from tarred dataset with metadata file {self.metadata_file}' - ) - if punct_label_ids is not None: - if punct_label_ids != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=punct_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc="Punctuation labels stored in an attribute " - "`PunctuationCapitalizationModel.punct_label_ids`", - ) - if capit_label_ids is not None: - if capit_label_ids != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=capit_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc="Capitalization labels stored in an attribute" - "`PunctuationCapitalizationModel.capit_label_ids`", - ) - if common_dataset_parameters_config.punct_label_ids is not None: - cfg_punct_label_ids = dict(common_dataset_parameters_config.punct_label_ids) - if cfg_punct_label_ids != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=cfg_punct_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc='Punctuation labels stored a config field ' - '`model.common_dataset_parameters.punct_label_ids`', - ) - if common_dataset_parameters_config.capit_label_ids is not None: - cfg_capit_label_ids = dict(common_dataset_parameters_config.capit_label_ids) - if cfg_capit_label_ids != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=cfg_capit_label_ids, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc='Capitalization labels stored a config field ' - '`model.common_dataset_parameters.capit_label_ids`', - ) - if common_dataset_parameters_config.label_vocab_dir is not None: - label_vocab_dir = Path(common_dataset_parameters_config.label_vocab_dir).expanduser() - punct_label_vocab_file = label_vocab_dir / class_labels.punct_labels_file - file_punct_vocab = load_label_ids(punct_label_vocab_file) - if file_punct_vocab != self.punct_label_ids: - raise_not_equal_labels_error( - first_labels=self.punct_label_ids, - second_labels=file_punct_vocab, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Punctuation'), - second_labels_desc=f'labels stored in file {punct_label_vocab_file} passed in ' - f'`model.common_dataset_parameters.punct_label_vocab_file`', - ) - capit_label_vocab_file = label_vocab_dir / class_labels.capit_labels_file - file_capit_vocab = load_label_ids(capit_label_vocab_file) - if file_capit_vocab != self.capit_label_ids: - raise_not_equal_labels_error( - first_labels=self.capit_label_ids, - second_labels=file_capit_vocab, - first_labels_desc=tarred_dataset_label_desc_tmpl.format(label_type='Capitalization'), - second_labels_desc=f'labels stored in file {capit_label_vocab_file} passed in ' - f'`model.common_dataset_parameters.capit_label_vocab_file`', - ) - - def save_labels_and_get_file_paths( - self, punct_labels_file_name: str, capit_labels_file_name: str - ) -> Tuple[Path, Path]: - """ - Copies label vocabulary files for punctuation and capitalization into directory passed in the constructor - parameter ``label_info_save_dir``. The names of new - files are ``punct_labels_file_name`` and ``capit_labels_file_name``. 
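-
-        For example (the ``dataset`` name and file names are illustrative)::
-
-            punct_file, capit_file = dataset.save_labels_and_get_file_paths(
-                'punct_label_vocab.csv', 'capit_label_vocab.csv'
-            )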
- - The signatures of this method and the signature of the method - :meth:`~nemo.collections.nlp.data.token_classification.BertPunctuationCapitalizationDataset.save_labels_and_get_file_paths` - must be identical. - - Args: - punct_labels_file_name (:obj:`str`): a name of punctuation labels file - capit_labels_file_name (:obj:`str`): a name of capitalization labels file - - Returns: - :obj:`Tuple[Path, Path]`: a tuple of 2 elements - - - :obj:`pathlib.Path`: a path to the new punctuation label ids file - - :obj:`pathlib.Path`: a path to the new capitalization label ids file - """ - self.for_nemo_ckpt.mkdir(parents=True, exist_ok=True) - punct_label_ids_file = self.for_nemo_ckpt / punct_labels_file_name - capit_label_ids_file = self.for_nemo_ckpt / capit_labels_file_name - shutil.copy(str(self.punct_label_vocab_file), str(punct_label_ids_file)) - shutil.copy(str(self.capit_label_vocab_file), str(capit_label_ids_file)) - return punct_label_ids_file, capit_label_ids_file - - def _build_sample(self, batch: Tuple[str, Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]: - """ - Takes batch loaded from tarred dataset and transforms it for passing to the model. Adds ``'segment_ids'``, - ``'input_mask'``, ``'loss_mask'`` items to the batch. - - Args: - batch: a tuple of 2 elements: batch name and a dictionary with ``'input_ids'``, ``'subtokens_mask'``, - ``'punct_labels'``, ``'capit_labels'``. Batch name is not needed for training and inference and - discarded. - - Returns: - a batch in the form of a dictionary with items: - - ``'input_ids'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'subtokens_mask'``: a boolean numpy array of shape ``[Batch, Time]``; - - ``'punct_labels'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'capit_labels'``: a ``np.int32`` numpy array of shape ``[Batch, Time]``; - - ``'segment_ids'``: a ``np.int8`` numpy array of shape ``[Batch, Time]``; - - ``'input_mask'``: a boolean numpy array of shape ``[Batch, Time]``; - - ``'loss_mask'``: a boolean numpy array of shape ``[Batch, Time]``. - """ - _, batch = batch - batch_segment_ids, batch_input_mask, batch_loss_mask = create_masks_and_segment_ids( - batch['input_ids'], - batch['subtokens_mask'], - self.tokenizer.pad_id, - self.tokenizer.cls_id, - self.tokenizer.sep_id, - self.ignore_start_end, - self.ignore_extra_tokens, - ) - batch['segment_ids'] = batch_segment_ids - batch['input_mask'] = batch_input_mask - batch['loss_mask'] = batch_loss_mask - return batch - - def __iter__(self) -> Iterator[Dict[str, np.ndarray]]: - """ - Constructs an iterator of batches. The values of one batch dictionary are numpy arrays of identical shapes - ``[Batch, Time]``. - - Returns: - :obj:`Iterator[Dict[str, np.ndarray]]`: an iterator of batches with items: - - - ``'input_ids'``: ``np.int32`` array containing encoded tokens, - - ``'subtokens_mask'``: ``bool`` array which elements are ``True`` if they correspond to first token in - a word, - - ``'punct_labels'``: ``np.int32`` array with encoded punctuation labels, - - ``'capit_labels'``: ``np.int32`` array with encoded capitalization labels, - - ``'segment_ids'``: ``np.int8`` array filled with zeros (BERT token types in HuggingFace terminology), - - ``'input_mask'``: ``bool`` array which elements are ``True`` if corresponding token is not a padding - token, - - ``'loss_mask'``: ``bool`` array which elements are ``True`` if loss is computed for corresponding - token. See more in description of constructor parameters ``ignore_start_end``, ``ignore_extra_tokens``. 
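-
-        For illustration, batches are usually consumed through a ``DataLoader`` with ``batch_size=1``
-        and :meth:`collate_fn` of this class (the ``dataset`` name is illustrative)::
-
-            loader = torch.utils.data.DataLoader(dataset, batch_size=1, collate_fn=dataset.collate_fn)
-            for batch in loader:
-                input_ids = batch['input_ids']  # torch.int32 tensor of shape [Batch, Time]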
- """ - return self._dataset.__iter__() - - def __len__(self) -> int: - return self.length - - def collate_fn(self, batches: List[Dict[str, np.ndarray]]) -> Dict[str, torch.Tensor]: - """ - Return zeroth batch of ``batches`` list passed for collating and casts ``'segment_ids'``, ``'punct_labels'``, - ``'capit_labels'`` to types supported by - :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel`. - All output tensors have shape ``[Batch, Time]``. - - .. warning:: - ``batch size`` parameter of a PyTorch data loader and sampler has to be ``1``. - - Args: - batches (:obj:`List[Dict[str, np.ndarray]]`): a list of batches passed for collating - - Returns: - :obj:`Dict[str, torch.Tensor]`: a batch dictionary with following items (for detailed description of batch - items see method :meth:`__getitem__`): - - - ``'input_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'subtokens_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'punct_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'capit_labels'`` (:obj:`torch.Tensor`): :obj:`torch.int64` tensor, - - ``'segment_ids'`` (:obj:`torch.Tensor`): :obj:`torch.int32` tensor, - - ``'input_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor, - - ``'loss_mask'`` (:obj:`torch.Tensor`): :obj:`torch.bool` tensor. - """ - batch = {k: torch.as_tensor(v) for k, v in batches[0].items()} - batch['segment_ids'] = batch['segment_ids'].int() - batch['punct_labels'] = batch['punct_labels'].long() - batch['capit_labels'] = batch['capit_labels'].long() - if self.use_audio: - batch['features'] = batch['features'].to(torch.float32) - return batch diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py deleted file mode 100644 index 4f49e34ce24e..000000000000 --- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" - -import os -import pickle -import tempfile -import time -from typing import Dict, List, Optional - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] - - -def get_features( - queries: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int = -1, - label_ids: dict = None, - pad_label: str = 'O', - raw_labels: List[str] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, -): - """ - Processes the data and returns features. - Args: - queries: text sequences - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP], when -1 - use the max len from the data - pad_label: pad value use for labels. By default, it's the neutral label. - raw_labels: list of labels for every word in a sequence - label_ids: dict to map labels to label ids. - Starts with pad_label->0 and then increases in alphabetical order. - Required for training and evaluation, not needed for inference. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask - """ - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_labels = [] - with_label = False - - if raw_labels is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - - # add bos token - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - pad_id = label_ids[pad_label] - labels = [pad_id] - query_labels = [label_ids[lab] for lab in raw_labels[i]] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - labels.extend([query_labels[j]] * len(word_tokens)) - # add eos token - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - - if with_label: - labels.append(pad_id) - all_labels.append(labels) - - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting Max Seq length to: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if 
len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append(tokenizer.tokens_to_ids(subtokens)) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_labels[i] = all_labels[i] + [pad_id] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.warning(f'{too_long_count} are longer than {max_seq_length}') - - for i in range(min(len(all_input_ids), 1)): - logging.info("*** Example ***") - logging.info("i: %s", i) - logging.info("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) - logging.info("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) - logging.info("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) - logging.info("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.info("labels: %s", " ".join(list(map(str, all_labels[i])))) - return (all_input_ids, all_segment_ids, all_input_mask, all_subtokens_mask, all_loss_mask, all_labels) - - -class BertTokenClassificationDataset(Dataset): - """ - Creates dataset to use during training for token classification tasks with a pretrained model. - - Converts from raw data to an instance that can be used by Dataloader. - For dataset to use during inference without labels, see BertTokenClassificationInferDataset. - - Args: - text_file: file to sequences, each line should a sentence, no header. - label_file: file to labels, each line corresponds to word labels for a sentence in the text_file. No header. - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as AutoTokenizer - num_samples: number of samples you want to use for the dataset. - If -1, use all dataset. Useful for testing. - pad_label: pad value use for labels. By default, it's the neutral label. - label_ids: label_ids (dict): dict to map labels to label ids. - Starts with pad_label->0 and then increases in alphabetical order - For dev set use label_ids generated during training to support - cases when not all labels are present in the dev set. - For training set label_ids should be None. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask - use_cache: whether to use processed data cache or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
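To make the sub-token alignment performed by get_features above concrete, here is a small self-contained illustration; the word-piece splits are assumed rather than produced by a real tokenizer. The first sub-token of each word gets subtokens_mask = 1, every piece of a word repeats the word's label, and [CLS]/[SEP] take the pad label with mask 0.

# Toy walk-through of the word-to-subtoken expansion (not the NeMo code path).
words = ["Hello", "world"]
raw_labels = ["O", "B-LOC"]
word_pieces = {"Hello": ["Hel", "##lo"], "world": ["world"]}  # assumed tokenization
pad_label = "O"

subtokens, labels, subtokens_mask = ["[CLS]"], [pad_label], [0]
for word, label in zip(words, raw_labels):
    pieces = word_pieces[word]
    subtokens.extend(pieces)
    labels.extend([label] * len(pieces))
    subtokens_mask.extend([1] + [0] * (len(pieces) - 1))
subtokens.append("[SEP]")
labels.append(pad_label)
subtokens_mask.append(0)

print(subtokens)       # ['[CLS]', 'Hel', '##lo', 'world', '[SEP]']
print(labels)          # ['O', 'O', 'O', 'B-LOC', 'O']
print(subtokens_mask)  # [0, 1, 0, 1, 0]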
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - text_file: str, - label_file: str, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: str = 'O', - label_ids: Dict[str, int] = None, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - use_cache: bool = True, - ): - """ Initializes BertTokenClassificationDataset. """ - - data_dir = os.path.dirname(text_file) - text_filename = os.path.basename(text_file) - lbl_filename = os.path.basename(label_file) - - if not text_filename.endswith('.txt'): - raise ValueError("{text_file} should have extension .txt") - - vocab_size = getattr(tokenizer, "vocab_size", 0) - features_pkl = os.path.join( - data_dir, - f"cached__{text_filename}__{lbl_filename}__{tokenizer.name}_{max_seq_length}_{vocab_size}_{num_samples}", - ) - - master_device = is_global_rank_zero() - features = None - if master_device and (not use_cache or not os.path.exists(features_pkl)): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(text_file, 'r') as f: - text_lines = f.readlines() - - labels_lines = [] - with open(label_file, 'r') as f: - for line in f: - line = line.strip().split() - labels_lines.append(line) - - if len(labels_lines) != len(text_lines): - raise ValueError("Labels file should contain labels for every word") - - if num_samples > 0: - dataset = list(zip(text_lines, labels_lines)) - dataset = dataset[:num_samples] - - dataset = list(zip(*dataset)) - text_lines = dataset[0] - labels_lines = dataset[1] - - features = get_features( - queries=text_lines, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - pad_label=pad_label, - raw_labels=labels_lines, - label_ids=label_ids, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - - # save features to a temp file first to make sure that non-master processes don't start reading the file - # until the master process is done with writing - ofd, tmp_features_pkl = tempfile.mkstemp( - suffix='.pkl', prefix=os.path.basename(features_pkl), dir=os.path.dirname(features_pkl) - ) - with os.fdopen(ofd, 'wb') as temp_f: - pickle.dump(features, temp_f) - - os.rename(tmp_features_pkl, features_pkl) - logging.info(f'features saved to {features_pkl}') - - # wait until the master process writes to the processed data files - if not master_device: - while features is None and not os.path.exists(features_pkl): - time.sleep(10) - - if features is None: - features = pickle.load(open(features_pkl, 'rb')) - logging.info(f'features restored from {features_pkl}') - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_subtokens_mask = features[3] - self.all_loss_mask = features[4] - self.all_labels = features[5] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_subtokens_mask[idx]), - np.array(self.all_loss_mask[idx]), - np.array(self.all_labels[idx]), - ) - - -class BertTokenClassificationInferDataset(Dataset): - """ - Creates 
dataset to use during inference for token classification tasks with a pretrained model. - For dataset to use during training with labels, see BertTokenClassificationDataset. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__( - self, queries: List[str], max_seq_length: int, tokenizer: TokenizerSpec, - ): - """ - Initializes BertTokenClassificationInferDataset - Args: - queries: text sequences - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as AutoTokenizer - """ - features = get_features(queries=queries, max_seq_length=max_seq_length, tokenizer=tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_subtokens_mask = features[3] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.longlong), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/nemo/collections/nlp/data/token_classification/token_classification_utils.py b/nemo/collections/nlp/data/token_classification/token_classification_utils.py deleted file mode 100644 index 94acd69d3b11..000000000000 --- a/nemo/collections/nlp/data/token_classification/token_classification_utils.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle -import re -import string -from typing import Dict - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_label_stats, -) -from nemo.utils import logging - -__all__ = ['get_label_ids', 'create_text_and_labels'] - - -def remove_punctuation(word: str): - """ - Removes all punctuation marks from a word except for ' - that is often a part of word: don't, it's, and so on - """ - all_punct_marks = string.punctuation.replace("'", '') - return re.sub('[' + all_punct_marks + ']', '', word) - - -def create_text_and_labels(output_dir: str, file_path: str, punct_marks: str = ',.?'): - """ - Create datasets for training and evaluation. - - Args: - output_dir: path to the output data directory - file_path: path to file name - punct_marks: supported punctuation marks - - The data will be split into 2 files: text.txt and labels.txt. \ - Each line of the text.txt file contains text sequences, where words\ - are separated with spaces. The labels.txt file contains \ - corresponding labels for each word in text.txt, the labels are \ - separated with spaces. 
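The cache handling in BertTokenClassificationDataset above follows a common multi-process pattern: only the global-rank-zero process computes and pickles the features, writing them to a temporary file and renaming it so readers never see a partial pickle, while the other ranks poll until the finished cache appears. A minimal standalone sketch of that pattern (compute_features and the cache path are placeholders):

import os
import pickle
import tempfile
import time


def load_or_build_cache(cache_path: str, is_rank_zero: bool, compute_features):
    """Rank zero builds the cache; other ranks wait for it to appear, then load it."""
    features = None
    if is_rank_zero and not os.path.exists(cache_path):
        features = compute_features()
        # Write to a temp file first, then rename, so other ranks never read
        # a half-written pickle.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(cache_path) or ".")
        with os.fdopen(fd, "wb") as f:
            pickle.dump(features, f)
        os.rename(tmp_path, cache_path)

    if not is_rank_zero:
        while not os.path.exists(cache_path):
            time.sleep(1)  # poll until rank zero has finished writing

    if features is None:
        with open(cache_path, "rb") as f:
            features = pickle.load(f)
    return features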
Each line of the files should follow the \ - format: \ - [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and \ - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' - """ - if not os.path.exists(file_path): - raise ValueError(f'{file_path} not found') - - os.makedirs(output_dir, exist_ok=True) - - base_name = os.path.basename(file_path) - labels_file = os.path.join(output_dir, 'labels_' + base_name) - text_file = os.path.join(output_dir, 'text_' + base_name) - - with open(file_path, 'r') as f: - with open(text_file, 'w') as text_f: - with open(labels_file, 'w') as labels_f: - for line in f: - line = line.split() - text = '' - labels = '' - for word in line: - label = word[-1] if word[-1] in punct_marks else 'O' - word = remove_punctuation(word) - if len(word) > 0: - if word[0].isupper(): - label += 'U' - else: - label += 'O' - - word = word.lower() - text += word + ' ' - labels += label + ' ' - - text_f.write(text.strip() + '\n') - labels_f.write(labels.strip() + '\n') - - print(f'{text_file} and {labels_file} created from {file_path}.') - - -def get_label_ids( - label_file: str, - is_training: bool = False, - pad_label: str = 'O', - label_ids_dict: Dict[str, int] = None, - get_weights: bool = True, - class_labels_file_artifact='label_ids.csv', -): - """ - Generates str to int labels mapping for training data or checks correctness of the label_ids_dict - file for non-training files or if label_ids_dict is specified - - Args: - label_file: the path of the label file to process - is_training: indicates whether the label_file is used for training - pad_label: token used for padding - label_ids_dict: str label name to int ids mapping. Required for non-training data. - If specified, the check that all labels from label_file are present in label_ids_dict will be performed. - For training data, if label_ids_dict is None, a new mapping will be generated from label_file. - get_weights: set to True to calculate class weights, required for Weighted Loss. 
- class_labels_file_artifact: name of the file to save in .nemo - """ - if not os.path.exists(label_file): - raise ValueError(f'File {label_file} was not found.') - - logging.info(f'Processing {label_file}') - if not is_training and label_ids_dict is None: - raise ValueError( - f'For non training data, label_ids_dict created during preprocessing of the training data ' - f'should be provided' - ) - - # collect all labels from the label_file - data_dir = os.path.dirname(label_file) - unique_labels = set(pad_label) - all_labels = [] - with open(label_file, 'r') as f: - for line in f: - line = line.strip().split() - all_labels.extend(line) - unique_labels.update(line) - - # check that all labels from label_file are present in the specified label_ids_dict - # or generate label_ids_dict from data (for training only) - if label_ids_dict: - logging.info(f'Using provided labels mapping {label_ids_dict}') - for name in unique_labels: - if name not in label_ids_dict: - raise ValueError(f'{name} class from {label_file} not found in the provided mapping: {label_ids_dict}') - else: - label_ids_dict = {pad_label: 0} - if pad_label in unique_labels: - unique_labels.remove(pad_label) - for label in sorted(unique_labels): - label_ids_dict[label] = len(label_ids_dict) - - label_ids_filename = os.path.join(data_dir, class_labels_file_artifact) - if is_training: - with open(label_ids_filename, 'w') as f: - labels, _ = zip(*sorted(label_ids_dict.items(), key=lambda x: x[1])) - f.write('\n'.join(labels)) - logging.info(f'Labels mapping {label_ids_dict} saved to : {label_ids_filename}') - - # calculate label statistics - base_name = os.path.splitext(os.path.basename(label_file))[0] - stats_file = os.path.join(data_dir, f'{base_name}_label_stats.tsv') - if os.path.exists(stats_file) and not is_training and not get_weights: - logging.info(f'{stats_file} found, skipping stats calculation.') - else: - all_labels = [label_ids_dict[label] for label in all_labels] - logging.info(f'Three most popular labels in {label_file}:') - total_labels, label_frequencies, max_id = get_label_stats(all_labels, stats_file) - logging.info(f'Total labels: {total_labels}. Label frequencies - {label_frequencies}') - - if get_weights: - class_weights_pkl = os.path.join(data_dir, f'{base_name}_weights.p') - if os.path.exists(class_weights_pkl): - class_weights = pickle.load(open(class_weights_pkl, 'rb')) - logging.info(f'Class weights restored from {class_weights_pkl}') - else: - class_weights_dict = get_freq_weights(label_frequencies) - logging.info(f'Class Weights: {class_weights_dict}') - class_weights = fill_class_weights(class_weights_dict, max_id) - - pickle.dump(class_weights, open(class_weights_pkl, "wb")) - logging.info(f'Class weights saved to {class_weights_pkl}') - else: - class_weights = None - - return label_ids_dict, label_ids_filename, class_weights diff --git a/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py b/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py deleted file mode 100644 index 6d56d4564a5c..000000000000 --- a/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
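A tiny worked example may help with the two helpers above; the sentence, punctuation set, and pad label below are made up for illustration. The first part mirrors the [PUNCT][CASE] labels written by create_text_and_labels, the second mirrors the get_label_ids convention of assigning id 0 to the pad label and the remaining ids in alphabetical order.

import re
import string

# Part 1: the [PUNCT][CASE] labels produced by create_text_and_labels.
punct_marks = ',.?'


def strip_punct(word):
    # Same character class as remove_punctuation above: all punctuation except apostrophes.
    return re.sub('[' + string.punctuation.replace("'", '') + ']', '', word)


line = "Hello, how are you?"
text, labels = [], []
for word in line.split():
    label = word[-1] if word[-1] in punct_marks else 'O'
    word = strip_punct(word)
    if word:
        label += 'U' if word[0].isupper() else 'O'
        text.append(word.lower())
        labels.append(label)
print(' '.join(text), '|', ' '.join(labels))   # hello how are you | ,U OO OO ?O

# Part 2: the label -> id convention used by get_label_ids
# (pad label chosen here only for this toy example).
pad_label = 'OO'
unique_labels = set(labels)                    # {',U', 'OO', '?O'}
label_ids = {pad_label: 0}
for lab in sorted(unique_labels - {pad_label}):
    label_ids[lab] = len(label_ids)
print(label_ids)                               # {'OO': 0, ',U': 1, '?O': 2}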
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( - ZeroShotIntentInferenceDataset, - calc_class_weights_from_dataloader, -) diff --git a/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py b/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py deleted file mode 100644 index d14e0c7b73c3..000000000000 --- a/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle -from typing import Dict, List, Optional - -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - DataProcessor, - fill_class_weights, - get_freq_weights, - get_label_stats, -) -from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['ZeroShotIntentProcessor', 'ZeroShotIntentDataset', 'ZeroShotIntentInferenceDataset'] - - -class ZeroShotIntentProcessor(DataProcessor): - """ - Processor for entailment data sets used to train NLI models for zero shot intent classification. 
- """ - - def __init__(self, sent1_col: int, sent2_col: int, label_col: int, num_classes: int): - """ - Args: - sent1_col: the index of the column containing the premise (or sentence 1) - sent2_col: the index of the column containing the hypothesis (or sentence 2) - label_col: the index of the column containing the label - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - """ - self.sent1_col = sent1_col - self.sent2_col = sent2_col - self.label_col = label_col - self.num_classes = num_classes - - def get_train_examples(self, file_path: str): - """Gets a collection of `InputExample`s for the train set.""" - return self._create_examples(self._read_tsv(file_path), "train") - - def get_dev_examples(self, file_path: str): - """Gets a collection of `InputExample`s for the dev set.""" - return self._create_examples(self._read_tsv(file_path), "dev") - - def get_labels(self): - """Gets the list of labels for this data set.""" - if self.num_classes == 2: - return ['not_entailment', 'entailment'] - elif self.num_classes == 3: - return ["contradiction", "entailment", "neutral"] - else: - raise ValueError("num_classes must be either 2 or 3!") - - def _create_examples(self, lines: List[str], set_type: str): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[self.sent1_col] - text_b = line[self.sent2_col] - label = line[self.label_col] - if label == "-": - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class ZeroShotIntentDataset(GLUEDataset): - """ - Dataset for training a NLI model for zero shot intent recognition. Similar to GLUE/MNLI - dataset, but allows the user to specify which columns in the data files contain the - premise, hypothesis, and gold label. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: str, - sent1_col: int, - sent2_col: int, - label_col: int, - num_classes: int, - use_cache: bool = True, - ): - """ - Args: - file_path: path to file - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length including [CLS] and [SEP] - sent1_col: the index of the column containing the premise (or sentence 1) - sent2_col: the index of the column containing the hypothesis (or sentence 2) - label_col: the index of the column containing the label - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - use_cache: whether to use data cache - """ - self.task_name = "mnli" # for compatibility with parent class - data_dir, file_name = os.path.split(file_path) - logging.info(f'Processing {file_name}') - self.tokenizer = tokenizer - evaluate = False if 'train' in file_name else True - processor = ZeroShotIntentProcessor(sent1_col, sent2_col, label_col, num_classes) - self.label_list = processor.get_labels() - if not evaluate: - self.examples = processor.get_train_examples(file_path) - - # check the labels found in the training set - all_train_labels = [example.label for example in self.examples] - unique_labels = set(all_train_labels) - if len(unique_labels) != num_classes: - raise ValueError( - "Number of classes specified in config doesn't match the number found in the training data!" - ) - elif len(unique_labels) == 2: - if not unique_labels == set(self.label_list): - raise ValueError( - f"Found unexpected labels! For a two-class model, labels are expected to be {self.label_list}" - ) - elif len(unique_labels) == 3: - if not unique_labels == set(self.label_list): - raise ValueError( - f"Found unexpected labels! 
For a three-class model, labels are expected to be {self.label_list}" - ) - - # save the label map for reference - label_file = os.path.join(data_dir, "label_ids.csv") - with open(label_file, "w") as out: - out.write('\n'.join(self.label_list)) - logging.info(f'Labels: {self.label_list}') - logging.info(f'Label mapping saved to : {label_file}') - - else: - self.examples = processor.get_dev_examples(file_path) - - processor_name = type(processor).__name__ - vocab_size = getattr(tokenizer, "vocab_size", 0) - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}_{}".format( - processor_name, file_name, tokenizer.name, str(max_seq_length), str(vocab_size) - ), - ) - - if use_cache and os.path.exists(cached_features_file): - logging.info(f"loading from {cached_features_file}") - with open(cached_features_file, "rb") as reader: - self.features = pickle.load(reader) - else: - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - - self.features = self.convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode="classification", **token_params - ) - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if master_device: - logging.info(f'Saving train features into {cached_features_file}') - with open(cached_features_file, "wb") as writer: - pickle.dump(self.features, writer) - - -class ZeroShotIntentInferenceDataset(GLUEDataset): - """ - Similar to ZeroShotIntentDataset, but gets utterances and candidate labels from lists - rather than sentence pairs and labels from a file. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__( - self, - queries: List[str], - candidate_labels: List[str], - tokenizer: TokenizerSpec, - max_seq_length: int, - hypothesis_template: str, - ): - """ - Args: - queries: list of utterances to classify - candidate_labels: list of candidate labels - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length including [CLS] and [SEP] - hypothesis_template: template used to turn each candidate label into a NLI-style hypothesis - """ - - logging.info(f'Processing queries for inference') - self.tokenizer = tokenizer - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - self.examples = [] - for i, query in enumerate(queries): - for j, candidate_label in enumerate(candidate_labels): - guid = "query-%s-label-%s" % (i, j) - text_a = query - text_b = hypothesis_template.format(candidate_label) - label = 3 # dummy label for inference; training labels are 0, 1, 2 or 0, 1 - self.examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - - self.features = self.convert_examples_to_features( - self.examples, [0, 1, 2, 3], max_seq_length, tokenizer, output_mode="classification", **token_params - ) - - -def calc_class_weights_from_dataloader( - dataloader: 'torch.utils.data.DataLoader', num_classes: int, data_dir: str -) -> List[float]: - """ - Calculate the weights of each class to be used for weighted loss. This is similar to the function calc_class_weights - in text_classification_dataset, but it gets the labels from a dataloader rather than from a file. - Args: - dataloader: the dataloader for the training set - num_classes: number of classes in the dataset - """ - labels = [] - for batch in dataloader: - labels.extend(tensor2list(batch[-1])) - logging.info(f'Calculating label frequency stats...') - total_sents, sent_label_freq, max_id = get_label_stats( - labels, os.path.join(data_dir, 'sentence_stats.tsv'), verbose=False - ) - if max_id >= num_classes: - raise ValueError(f'Found an invalid label! Labels should be from [0, num_classes-1].') - - class_weights_dict = get_freq_weights(sent_label_freq) - - logging.info(f'Total Sentence Pairs: {total_sents}') - logging.info(f'Class Frequencies: {sent_label_freq}') - logging.info(f'Class Weights: {class_weights_dict}') - class_weights = fill_class_weights(weights=class_weights_dict, max_id=num_classes - 1) - return class_weights diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py deleted file mode 100644 index 794f43dcbb52..000000000000 --- a/nemo/collections/nlp/metrics/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.metrics.classification_report import ( # noqa: F401 - ClassificationReport, - MultiLabelClassificationReport, -) -from nemo.collections.nlp.metrics.sequence_perplexity import SequencePerplexity # noqa: F401 diff --git a/nemo/collections/nlp/metrics/sequence_perplexity.py b/nemo/collections/nlp/metrics/sequence_perplexity.py deleted file mode 100644 index 339f062f7cc1..000000000000 --- a/nemo/collections/nlp/metrics/sequence_perplexity.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torchmetrics import Metric - -__all__ = ['SequencePerplexity'] - - -class SequencePerplexity(Metric): - """ - This class computes mean perplexity across the batches of sequences. - - You have to provide ``log_probs`` (float tensor of shape [batch_size x seq_length x vocab_size]) and - ``labels`` (int tensor of shape [batch_size x seq_length] with values from the range [0, vocab_size-1]) - to the :meth:`update` method. If some of the sequences are shorter than seq_length, you can also provide - an optional argument ``mask`` (bool tensor of shape [batch_size x seq_length]) which masks out tokens - not participating in perplexity computation. - - See :doc:`PyTorch Lightning Metrics` for the metric usage instructions. - - Args: - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. default: ``None`` (which selects the entire - world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP will be used - to perform the allgather. - """ - - def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): - super().__init__( - dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn, - ) - - # Total sum of exponentiated average negative log likelihoods - self.add_state('perplexities_sum', default=torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') - # Total number of sequences in all batches - self.add_state('num_sequences', default=torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') - - def update(self, log_probs: torch.Tensor, labels: torch.Tensor, mask=None): - - if mask is None: - mask = torch.ones_like(labels) - if mask.dtype is not log_probs.dtype: - mask = mask.to(log_probs.dtype) - - target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2) - avg_neg_ll = -(target_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1) - ppl = avg_neg_ll.exp() - self.num_sequences += ppl.numel() - self.perplexities_sum += ppl.sum() - - def compute(self): - """ - Returns perplexity across all workers and resets to 0 :attr:`perplexities_sum` and :attr:`num_sequences`. 
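The update() method above produces one perplexity value per sequence, the exponentiated average negative log-likelihood over non-masked tokens, and compute() averages those values across sequences and workers. A standalone numeric sketch of the per-sequence step, using random tensors and no distributed synchronization:

import torch

batch, seq_len, vocab = 2, 4, 10
log_probs = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=-1)
labels = torch.randint(0, vocab, (batch, seq_len))
mask = torch.ones(batch, seq_len)          # zeros would mark padding positions

# Gather the log-probability of each target token, average the negatives
# over non-masked positions, and exponentiate.
target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2)
avg_neg_ll = -(target_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1)
ppl = avg_neg_ll.exp()                     # one perplexity per sequence
print(ppl.mean())                          # batch-mean perplexity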
- """ - if self.num_sequences.eq(0): - return None - return self.perplexities_sum / self.num_sequences diff --git a/nemo/collections/nlp/metrics/sgd_metrics.py b/nemo/collections/nlp/metrics/sgd_metrics.py deleted file mode 100644 index 53666fb08928..000000000000 --- a/nemo/collections/nlp/metrics/sgd_metrics.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Evaluation metrics for Schema-guided dialogue. - -This library provides functions for calculating the evaluation metrics for a -single dialogue. The following metrics are defined: - -(1) Active intent accuracy: The fraction of user turns for which the active - intent has been correctly predicted. -(2) Slot tagging F1: The macro-averaged F1 score for tagging slot values for - non-categorical slots. This metric is optional to report in the final paper - if participants decide not to use slot tagging. -(3) Requested slots F1: The macro-averaged F1 score for requested slots over the - turns. For a turn, if there are no requested slots in both the ground truth - and the prediction, that turn is skipped. The reported number is the average - F1 score for all un-skipped user turns. This metric is optional to report in - the final paper. -(4) Average goal accuracy: For each turn, participants must predict a single - value for each slot present in the dialogue state. The slots which have a - non-empty assignment in the ground truth dialogue state are only considered. - This is the average accuracy of predicting the value of a slot correctly. A - fuzzy matching based score is used for non-categorical slots. -(5) Joint goal accuracy: This is the average accuracy of predicting all slot - assignments for a turn correctly. A fuzzy matching based score is used for - non-categorical slots. This is the primary evaluation metric used for ranking - submissions. More details to follow with the evaluation script. - -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/metrics.py -""" - -import collections - -import numpy as np -from rapidfuzz import fuzz - -F1Scores = collections.namedtuple("F1Scores", ["f1", "precision", "recall"]) - -# Evaluation and other relevant metrics for DSTC8/SGD Schema-guided DST. -# (1) Active intent accuracy. -ACTIVE_INTENT_ACCURACY = "active_intent_accuracy" -# (2) Slot tagging F1. -SLOT_TAGGING_F1 = "slot_tagging_f1" -SLOT_TAGGING_PRECISION = "slot_tagging_precision" -SLOT_TAGGING_RECALL = "slot_tagging_recall" -# (3) Requested slots F1. -REQUESTED_SLOTS_F1 = "requested_slots_f1" -REQUESTED_SLOTS_PRECISION = "requested_slots_precision" -REQUESTED_SLOTS_RECALL = "requested_slots_recall" -# (4) Average goal accuracy. 
-AVERAGE_GOAL_ACCURACY = "average_goal_accuracy" -AVERAGE_CAT_ACCURACY = "average_cat_accuracy" -AVERAGE_NONCAT_ACCURACY = "average_noncat_accuracy" -# (5) Joint goal accuracy. -JOINT_GOAL_ACCURACY = "joint_goal_accuracy" -JOINT_CAT_ACCURACY = "joint_cat_accuracy" -JOINT_NONCAT_ACCURACY = "joint_noncat_accuracy" - - -AVERAGE_CAT_STATUS_ACCURACY = "average_cat_status_accuracy" -AVERAGE_CAT_VALUE_ACCURACY = "average_cat_value_accuracy" -AVERAGE_NONCAT_STATUS_ACCURACY = "average_noncat_status_accuracy" -AVERAGE_NONCAT_VALUE_ACCURACY = "average_noncat_value_accuracy" - -JOINT_CAT_STATUS_ACCURACY = "joint_cat_status_accuracy" -JOINT_CAT_VALUE_ACCURACY = "joint_cat_value_accuracy" -JOINT_NONCAT_STATUS_ACCURACY = "joint_noncat_status_accuracy" -JOINT_NONCAT_VALUE_ACCURACY = "joint_noncat_value_accuracy" - - -NAN_VAL = "NA" - - -def compute_f1(list_ref, list_hyp): - """Compute F1 score from reference (grouth truth) list and hypothesis list. - Args: - list_ref: List of true elements. - list_hyp: List of postive (retrieved) elements. - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - - ref = collections.Counter(list_ref) - hyp = collections.Counter(list_hyp) - true = sum(ref.values()) - positive = sum(hyp.values()) - true_positive = sum((ref & hyp).values()) - precision = float(true_positive) / positive if positive else 1.0 - recall = float(true_positive) / true if true else 1.0 - if precision + recall > 0.0: - f1 = 2.0 * precision * recall / (precision + recall) - else: # The F1-score is defined to be 0 if both precision and recall are 0. - f1 = 0.0 - - return F1Scores(f1=f1, precision=precision, recall=recall) - - -def fuzzy_string_match(str_ref, str_hyp): - """Returns fuzzy string similarity score in range [0.0, 1.0]. - Args: - str_ref: reference string - str_hyp: hypothesis string - Returns: - fuzzy string similarity - """ - - # The higher the score, the higher the similarity between the two strings. - return fuzz.token_sort_ratio(str_ref, str_hyp) / 100.0 - - -def noncat_slot_value_match(str_ref_list, str_hyp, use_fuzzy_match): - """Calculate non-categorical slots correctness. - Args: - str_ref_list: a list of reference strings. - str_hyp: the hypothesis string. - use_fuzzy_match: whether to use fuzzy string matching. - Returns: - score: The highest fuzzy string match score of the references and hypotheis. - """ - score = 0.0 - for str_ref in str_ref_list: - if use_fuzzy_match: - match_score = fuzzy_string_match(str_ref, str_hyp) - else: - match_score = float(str_ref == str_hyp) - score = max(score, match_score) - return score - - -def compare_slot_values(slot_values_ref, slot_values_hyp, service, use_fuzzy_match): - """Compare and get correctness of goal state's slot_values. - - Args: - slot_values_ref: goal state slot_values from reference (ground truth). - slot_values_hyp: goal state slot_values from hypothesis (prediction). - service: a service data structure in the schema. We use it to obtain the - list of slots in the service and infer whether a slot is categorical. - use_fuzzy_match: whether to use fuzzy string matching for non-categorical - slot values - - Returns: - list_cor: list of corectness scores, each corresponding to one slot in the - service. The score is a float either 0.0 or 1.0 for categorical slot, - and in range [0.0, 1.0] for non-categorical slot. - slot_active: list indicating whether the element in list_cor corresponds to - an active ground-truth slot. 
- slot_cat: list indicating whether the element in list_cor corresponds to a - categorical slot. - list_cor_status: list of correct slot statuses - list_cor_value: list of correctness score only for active slots. Monactive slots are assigned -1. - """ - list_cor = [] - list_cor_status = [] - list_cor_value = [] - slot_active = [] - slot_cat = [] - - for slot in service["slots"]: - slot_name = slot["name"] - slot_cat.append(slot["is_categorical"]) - - if slot_name in slot_values_ref: # REF=active - slot_active.append(True) - if slot_name in slot_values_hyp: # HYP=active, apply matching - value_ref_list = slot_values_ref[slot_name] - value_hyp = slot_values_hyp[slot_name][0] - if slot["is_categorical"]: - cor = float(value_ref_list[0] == value_hyp) - else: - cor = noncat_slot_value_match(value_ref_list, value_hyp, use_fuzzy_match) - list_cor.append(cor) - list_cor_status.append(1.0) - list_cor_value.append(cor) - else: # HYP=off - list_cor.append(0.0) - list_cor_status.append(0.0) - list_cor_value.append(-1.0) - else: # REF=off - slot_active.append(False) - if slot_name in slot_values_hyp: # HYP=active - list_cor.append(0.0) - list_cor_status.append(0.0) - else: # HYP=off - list_cor.append(1.0) - list_cor_status.append(1.0) - list_cor_value.append(-1.0) - - assert len(list_cor) == len(service["slots"]) - assert len(slot_active) == len(service["slots"]) - assert len(slot_cat) == len(service["slots"]) - return list_cor, slot_active, slot_cat, list_cor_status, list_cor_value - - -def get_active_intent_accuracy(frame_ref, frame_hyp): - """Get active intent accuracy of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - - Returns: - 1.0 if the intent prediction is correct, otherwise 0.0. - """ - return float(frame_ref["state"]["active_intent"] == frame_hyp["state"]["active_intent"]) - - -def get_slot_tagging_f1(frame_ref, frame_hyp, utt, service): - """Get slot tagging (non-categorical slots only) F1 scores of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - utt: user utterance. Slot tagging annotations are the character positions in - the utterance. - service: a service data structure in the schema. We use it to infer whether - a slot is non-categorical. - - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - list_noncat_slots = [s["name"] for s in service["slots"] if not s["is_categorical"]] - if "slots" not in frame_hyp: - return None - else: - list_ref = [ - (s["slot"], utt[s["start"] : s["exclusive_end"]]) - for s in frame_ref["slots"] - if s["slot"] in list_noncat_slots - ] - list_hyp = [ - (s["slot"], utt[s["start"] : s["exclusive_end"]]) - for s in frame_hyp["slots"] - if s["slot"] in list_noncat_slots - ] - return compute_f1(list_ref, list_hyp) - - -def get_requested_slots_f1(frame_ref, frame_hyp): - """Get requested slots F1 scores of a frame. - - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - - Returns: - A F1Scores object containing F1, precision, and recall scores. - """ - return compute_f1(frame_ref["state"]["requested_slots"], frame_hyp["state"]["requested_slots"]) - - -def get_average_and_joint_goal_accuracy(frame_ref, frame_hyp, service, use_fuzzy_match): - """Get average and joint goal accuracies of a frame. 
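compute_f1 above treats the reference and hypothesis lists as multisets via collections.Counter, which is how both the slot-tagging and requested-slots F1 scores are obtained. The self-contained restatement below, called on arbitrary requested-slot names, shows how precision, recall, and F1 fall out of the counts:

import collections

F1Scores = collections.namedtuple("F1Scores", ["f1", "precision", "recall"])


def compute_f1(list_ref, list_hyp):
    ref, hyp = collections.Counter(list_ref), collections.Counter(list_hyp)
    true, positive = sum(ref.values()), sum(hyp.values())
    true_positive = sum((ref & hyp).values())       # multiset intersection
    precision = true_positive / positive if positive else 1.0
    recall = true_positive / true if true else 1.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return F1Scores(f1=f1, precision=precision, recall=recall)


print(compute_f1(["city", "date", "date"], ["date", "time"]))
# ~ F1Scores(f1=0.4, precision=0.5, recall=0.33)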
- - Args: - frame_ref: single semantic frame from reference (ground truth) file. - frame_hyp: single semantic frame from hypothesis (prediction) file. - service: a service data structure in the schema. We use it to obtain the - list of slots in the service and infer whether a slot is categorical. - use_fuzzy_match: whether to use fuzzy string matching for comparing - non-categorical slot values. - - Returns: - goal_acc: a dict whose values are average / joint - all-goal / categorical-goal / non-categorical-goal accuracies. - """ - goal_acc = {} - - list_acc, slot_active, slot_cat, list_status_acc, list_value_acc = compare_slot_values( - frame_ref["state"]["slot_values"], frame_hyp["state"]["slot_values"], service, use_fuzzy_match - ) - - # (4) Average goal accuracy. - active_acc = [acc for acc, active in zip(list_acc, slot_active) if active] - goal_acc[AVERAGE_GOAL_ACCURACY] = np.mean(active_acc) if active_acc else NAN_VAL - # (4-a) categorical. - active_cat_acc = [acc for acc, active, cat in zip(list_acc, slot_active, slot_cat) if active and cat] - goal_acc[AVERAGE_CAT_ACCURACY] = np.mean(active_cat_acc) if active_cat_acc else NAN_VAL - # (4-b) non-categorical. - active_noncat_acc = [acc for acc, active, cat in zip(list_acc, slot_active, slot_cat) if active and not cat] - goal_acc[AVERAGE_NONCAT_ACCURACY] = np.mean(active_noncat_acc) if active_noncat_acc else NAN_VAL - - # (5) Joint goal accuracy. - goal_acc[JOINT_GOAL_ACCURACY] = np.prod(list_acc) if list_acc else NAN_VAL - # (5-a) categorical. - cat_acc = [acc for acc, cat in zip(list_acc, slot_cat) if cat] - goal_acc[JOINT_CAT_ACCURACY] = np.prod(cat_acc) if cat_acc else NAN_VAL - # (5-b) non-categorical. - noncat_acc = [acc for acc, cat in zip(list_acc, slot_cat) if not cat] - goal_acc[JOINT_NONCAT_ACCURACY] = np.prod(noncat_acc) if noncat_acc else NAN_VAL - - # !!!!!!!!!!DEBUG!!!!!!!!!!!!! 
- # cat status acc for both active and non active - active_cat_status_acc = [acc for acc, active, cat in zip(list_status_acc, slot_active, slot_cat) if cat and active] - goal_acc[AVERAGE_CAT_STATUS_ACCURACY] = np.mean(active_cat_status_acc) if active_cat_status_acc else NAN_VAL - # joint cat status acc for both active and non active - cat_status_acc = [acc for acc, cat in zip(list_status_acc, slot_cat) if cat] - goal_acc[JOINT_CAT_STATUS_ACCURACY] = np.prod(cat_status_acc) if cat_status_acc else NAN_VAL - - # non cat status acc for both active and non active - active_noncat_status_acc = [ - acc for acc, active, cat in zip(list_status_acc, slot_active, slot_cat) if not cat and active - ] - goal_acc[AVERAGE_NONCAT_STATUS_ACCURACY] = ( - np.mean(active_noncat_status_acc) if active_noncat_status_acc else NAN_VAL - ) - # joint non cat status acc for both active and non active - noncat_status_acc = [acc for acc, cat in zip(list_status_acc, slot_cat) if not cat] - goal_acc[JOINT_NONCAT_STATUS_ACCURACY] = np.prod(noncat_status_acc) if noncat_status_acc else NAN_VAL - - # cat value acc for both active and non active - active_cat_val_acc = [ - acc for acc, active, cat in zip(list_value_acc, slot_active, slot_cat) if cat and acc > -0.5 and active - ] - goal_acc[AVERAGE_CAT_VALUE_ACCURACY] = np.mean(active_cat_val_acc) if active_cat_val_acc else NAN_VAL - # joint cat value acc for both active and non active - cat_val_acc = [acc for acc, cat in zip(list_value_acc, slot_cat) if cat and acc > -0.5] - goal_acc[JOINT_CAT_VALUE_ACCURACY] = np.prod(cat_val_acc) if cat_val_acc else NAN_VAL - - # cat non value acc for both active and non active - active_noncat_val_acc = [ - acc for acc, active, cat in zip(list_value_acc, slot_active, slot_cat) if not cat and acc > -0.5 and active - ] - goal_acc[AVERAGE_NONCAT_VALUE_ACCURACY] = np.mean(active_noncat_val_acc) if active_noncat_val_acc else NAN_VAL - # joint non cat value acc for both active and non active - noncat_val_acc = [acc for acc, cat in zip(list_value_acc, slot_cat) if not cat and acc > -0.5] - goal_acc[JOINT_NONCAT_VALUE_ACCURACY] = np.prod(noncat_val_acc) if noncat_val_acc else NAN_VAL - - return goal_acc diff --git a/nemo/collections/nlp/models/__init__.py b/nemo/collections/nlp/models/__init__.py index 6fa39cbe053b..25b4980f70c3 100644 --- a/nemo/collections/nlp/models/__init__.py +++ b/nemo/collections/nlp/models/__init__.py @@ -12,21 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
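To make the difference between the average and joint goal accuracies computed above concrete, here is a tiny numeric illustration with made-up per-slot scores, using the same np.mean over active slots versus np.prod over all slots as get_average_and_joint_goal_accuracy:

import numpy as np

list_acc = [1.0, 0.8, 0.0]             # one correctness score per slot in the service
slot_active = [True, True, False]      # slots set in the ground-truth dialogue state

active_acc = [a for a, act in zip(list_acc, slot_active) if act]
average_goal_accuracy = np.mean(active_acc)   # (1.0 + 0.8) / 2 = 0.9
joint_goal_accuracy = np.prod(list_acc)       # 1.0 * 0.8 * 0.0 = 0.0
print(average_goal_accuracy, joint_goal_accuracy)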
- -from nemo.collections.nlp.models.entity_linking.entity_linking_model import EntityLinkingModel # noqa: F401 -from nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model import GLUEModel # noqa: F401 -from nemo.collections.nlp.models.information_retrieval import BertDPRModel, BertJointIRModel # noqa: F401 -from nemo.collections.nlp.models.intent_slot_classification import ( # noqa: F401 - IntentSlotClassificationModel, - MultiLabelIntentSlotClassificationModel, -) from nemo.collections.nlp.models.language_modeling import MegatronGPTPromptLearningModel # noqa: F401 from nemo.collections.nlp.models.language_modeling.bert_lm_model import BERTLMModel # noqa: F401 from nemo.collections.nlp.models.language_modeling.transformer_lm_model import TransformerLMModel # noqa: F401 from nemo.collections.nlp.models.machine_translation import MTEncDecModel # noqa: F401 -from nemo.collections.nlp.models.token_classification import ( # noqa: F401 - PunctuationCapitalizationLexicalAudioModel, - PunctuationCapitalizationModel, - TokenClassificationModel, -) -from nemo.collections.nlp.models.zero_shot_intent_recognition import ZeroShotIntentModel # noqa: F401 diff --git a/nemo/collections/nlp/models/entity_linking/__init__.py b/nemo/collections/nlp/models/entity_linking/__init__.py deleted file mode 100644 index 925bfc18c77e..000000000000 --- a/nemo/collections/nlp/models/entity_linking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.entity_linking.entity_linking_model import EntityLinkingModel diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py deleted file mode 100644 index 640520cdaaa7..000000000000 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig -from transformers import AutoTokenizer - -from nemo.collections.common.losses import MultiSimilarityLoss -from nemo.collections.nlp.data import EntityLinkingDataset -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import LogitsType, NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['EntityLinkingModel'] - - -class EntityLinkingModel(NLPModel, Exportable): - """ - Second stage pretraining of BERT based language model - for entity linking task. An implementation of Liu et. al's - NAACL 2021 paper Self-Alignment Pretraining for Biomedical Entity Representations. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"logits": NeuralType(('B', 'D'), LogitsType())} - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes the SAP-BERT model for entity linking.""" - - # deprecation warning - deprecated_warning("EntityLinkingModel") - - # tokenizer needed before super().__init__() so dataset and loader can process data - self._setup_tokenizer(cfg.tokenizer) - - super().__init__(cfg=cfg, trainer=trainer) - - # Token to use for the self-alignment loss, typically the first token, [CLS] - self._idx_conditioned_on = 0 - self.loss = MultiSimilarityLoss() - - def _setup_tokenizer(self, cfg: DictConfig): - tokenizer = AutoTokenizer.from_pretrained( - cfg.tokenizer_name, vocab_file=cfg.vocab_file, do_lower_case=cfg.do_lower_case - ) - - self.tokenizer = tokenizer - - @typecheck() - def forward(self, input_ids, token_type_ids, attention_mask): - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - # normalize to unit sphere - logits = torch.nn.functional.normalize(hidden_states[:, self._idx_conditioned_on], p=2, dim=1) - return logits - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - input_ids, token_type_ids, attention_mask, concept_ids = batch - logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - train_loss = self.loss(logits=logits, labels=concept_ids) - - # No hard examples found in batch, - # shouldn't use this batch to update model weights - if train_loss == 0: - train_loss = None - lr = None - - else: - lr = self._optimizer.param_groups[0]["lr"] - self.log("train_loss", train_loss) - self.log("lr", lr, prog_bar=True) - - return {"loss": train_loss, "lr": lr} - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
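The forward pass above keeps only the hidden state of the conditioning token (index 0, i.e. [CLS]) and L2-normalizes it, so the multi-similarity loss sees unit-length concept embeddings. A standalone sketch with a random tensor standing in for the BERT encoder output:

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 4, 16, 768
hidden_states = torch.randn(batch, seq_len, hidden)   # stand-in for the encoder output
idx_conditioned_on = 0                                 # first token, i.e. [CLS]

# Project the [CLS] embedding onto the unit sphere so dot products
# behave like cosine similarities.
logits = F.normalize(hidden_states[:, idx_conditioned_on], p=2, dim=1)
print(logits.shape)        # torch.Size([4, 768])
print(logits.norm(dim=1))  # all ones: unit-length embeddings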
- """ - input_ids, input_type_ids, input_mask, concept_ids = batch - with torch.no_grad(): - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - val_loss = self.loss(logits=logits, labels=concept_ids) - - # No hard examples found in batch, - # val loss not used to update model weights - if val_loss == 0: - val_loss = None - else: - self.log("val_loss", val_loss) - logging.info(f"val loss: {val_loss}") - - loss = {"val_loss": val_loss} - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - - Args: - outputs: list of individual outputs of each validation step. - Returns: - - """ - if self.validation_step_outputs: - avg_loss = torch.stack( - [x["val_loss"] for x in self.validation_step_outputs if x["val_loss"] != None] - ).mean() - self.log(f"val_loss", avg_loss, prog_bar=True) - self.validation_step_outputs.clear() # free memory - return {"val_loss": avg_loss} - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - if not train_data_config or not train_data_config.data_file: - logging.info( - f"Dataloader config or file_path or processed data path for the train dataset is missing, \ - so no data loader for train is created!" - ) - - self._train_dl = None - return - - self._train_dl = self.setup_dataloader(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - if not val_data_config or not val_data_config.data_file: - logging.info( - f"Dataloader config or file_path or processed data path for the val dataset is missing, \ - so no data loader for validation is created!" - ) - - self._validation_dl = None - return - - self._validation_dl = self.setup_dataloader(cfg=val_data_config) - - def setup_dataloader(self, cfg: Dict, is_index_data: bool = False) -> 'torch.utils.data.DataLoader': - - dataset = EntityLinkingDataset( - tokenizer=self.tokenizer, - data_file=cfg.data_file, - max_seq_length=cfg.max_seq_length, - is_index_data=is_index_data, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - collate_fn=dataset.collate_fn, - shuffle=cfg.get("shuffle", True), - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass - - @classmethod - def from_pretrained(cls, name: str): - pass diff --git a/nemo/collections/nlp/models/glue_benchmark/__init__.py b/nemo/collections/nlp/models/glue_benchmark/__init__.py deleted file mode 100644 index eecc4db9100c..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model import GLUEModel diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py deleted file mode 100644 index e90cf9d88c30..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Dict, Optional, Union - -import numpy as np -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import CrossEntropyLoss, MSELoss -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUE_TASKS_NUM_LABELS, GLUEDataset -from nemo.collections.nlp.models.glue_benchmark.metrics_for_glue import compute_metrics -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceClassifier, SequenceRegression -from nemo.collections.nlp.parts.utils_funcs import list2str, tensor2list -from nemo.core.classes import typecheck -from nemo.core.neural_types import NeuralType -from nemo.utils import logging -from nemo.utils.decorators import deprecated_warning - -__all__ = ['GLUEModel'] - -''' -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -Example of running a pretrained BERT model on the 9 GLUE tasks, read more -about GLUE benchmark here: https://gluebenchmark.com -Download the GLUE data by running the script: -https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e - -Some of these tasks have a small dataset and training can lead to high variance -in the results between different runs. Below is the median on 5 runs -(with different seeds) for each of the metrics on the dev set of the benchmark -with an uncased BERT base model (the checkpoint bert-base-uncased) -(source https://github.com/huggingface/transformers/tree/master/examples#glue). -Task Metric Result -CoLA Matthew's corr 48.87 -SST-2 Accuracy 91.74 -MRPC F1/Accuracy 90.70/86.27 -STS-B Person/Spearman corr. 91.39/91.04 -QQP Accuracy/F1 90.79/87.66 -MNLI Matched acc./Mismatched acc. 83.70/84.83 -QNLI Accuracy 89.31 -RTE Accuracy 71.43 -WNLI Accuracy 43.66 - -''' - - -class GLUEModel(NLPModel): - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return self.bert_model.input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return self.pooler.output_types - - @property - def output_module(self): - return self.pooler - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ - Initializes model to use BERT model for GLUE tasks. 
- """ - # deprecation warning - deprecated_warning("GLUEModel") - - if cfg.task_name not in cfg.supported_tasks: - raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') - self.task_name = cfg.task_name - - # needed to setup validation on multiple datasets - # MNLI task has two separate dev sets: matched and mismatched - if not self._is_model_being_restored(): - if self.task_name == "mnli": - cfg.validation_ds.ds_item = [ - os.path.join(cfg.dataset.data_dir, 'dev_matched.tsv'), - os.path.join(cfg.dataset.data_dir, 'dev_mismatched.tsv'), - ] - else: - cfg.validation_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.validation_ds.ds_item) - cfg.train_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.train_ds.ds_item) - logging.info(f'Using {cfg.validation_ds.ds_item} for model evaluation.') - - super().__init__(cfg=cfg, trainer=trainer) - - num_labels = GLUE_TASKS_NUM_LABELS[self.task_name] - # uses [CLS] token for classification (the first token) - if self.task_name == "sts-b": - self.pooler = SequenceRegression(hidden_size=self.bert_model.config.hidden_size) - self.loss = MSELoss() - else: - self.pooler = SequenceClassifier( - hidden_size=self.bert_model.config.hidden_size, num_classes=num_labels, log_softmax=False - ) - self.loss = CrossEntropyLoss() - - def update_data_dir(self, data_dir: str) -> None: - """ - Update data directory and get data stats with Data Descriptor - Weights are later used to setup loss - - Args: - data_dir: path to data directory - """ - self._cfg.dataset.data_dir = data_dir - logging.info(f'Setting model.dataset.data_dir to {data_dir}.') - if self.task_name == "mnli": - self._cfg.validation_ds.ds_item = [ - os.path.join(data_dir, 'dev_matched.tsv'), - os.path.join(data_dir, 'dev_mismatched.tsv'), - ] - else: - self._cfg.validation_ds.ds_item = os.path.join(data_dir, 'dev.tsv') - - self._cfg.train_ds.ds_item = os.path.join(data_dir, 'train.tsv') - logging.info(f'Using {self._cfg.validation_ds.ds_item} for model evaluation.') - - @typecheck() - def forward(self, input_ids, token_type_ids, attention_mask): - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - output = self.pooler(hidden_states=hidden_states) - return output - - def training_step(self, batch, batch_idx): - input_ids, input_type_ids, input_mask, labels = batch - model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - if self.task_name == "sts-b": - loss = self.loss(preds=model_output, labels=labels) - else: - loss = self.loss(logits=model_output, labels=labels) - - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - input_ids, input_type_ids, input_mask, labels = batch - model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - if self.task_name == "sts-b": - val_loss = self.loss(preds=model_output, labels=labels) - else: - val_loss = self.loss(logits=model_output, labels=labels) - - if self.task_name != 'sts-b': - model_output = torch.argmax(model_output, 1) - - eval_tensors = {'preds': model_output, 'labels': labels} - output = {'val_loss': val_loss, 'eval_tensors': eval_tensors} - self.validation_step_outputs.append(output) - return output - - def 
multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - """ - Called at the end of validation to aggregate outputs. - outputs: list of individual outputs of each validation step. - """ - avg_loss = torch.stack([x['val_loss'] for x in self.validation_step_outputs]).mean() - preds = torch.cat([x['eval_tensors']['preds'] for x in self.validation_step_outputs]) - labels = torch.cat([x['eval_tensors']['labels'] for x in self.validation_step_outputs]) - - all_preds = [] - all_labels = [] - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - for ind in range(world_size): - all_preds.append(torch.empty_like(preds)) - all_labels.append(torch.empty_like(labels)) - torch.distributed.all_gather(all_preds, preds) - torch.distributed.all_gather(all_labels, labels) - else: - all_preds.append(preds) - all_labels.append(labels) - - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - preds = [] - labels = [] - for p in all_preds: - preds.extend(tensor2list(p)) - for l in all_labels: - labels.extend(tensor2list(l)) - - results = compute_metrics(self.task_name, np.array(preds), np.array(labels)) - val_name = self._validation_names[dataloader_idx].upper() - logging.info(f'{val_name} evaluation: {results}') - - # writing labels and predictions to a file in output_dir is specified in the config - output_dir = self._cfg.output_dir - if output_dir: - os.makedirs(output_dir, exist_ok=True) - filename = os.path.join(output_dir, f'{self.task_name}_{val_name}.txt') - logging.info(f'Saving labels and predictions to {filename}') - with open(filename, 'w') as f: - f.write('labels\t' + list2str(labels) + '\n') - f.write('preds\t' + list2str(preds) + '\n') - - self.log('val_loss', avg_loss) - if self.trainer.is_global_zero: - for k, v in results.items(): - self.log(f'{val_name}_{k}', v, rank_zero_only=True) - - def setup_training_data(self, train_data_config: Optional[DictConfig] = None): - if train_data_config is None: - train_data_config = self._cfg.train_ds - - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): - if val_data_config is None: - val_data_config = self._cfg.validation_ds - - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_multiple_validation_data(self, val_data_config: Union[DictConfig, Dict] = None): - if val_data_config is None: - val_data_config = self._cfg.validation_ds - - return super().setup_multiple_validation_data(val_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - file_name = cfg.ds_item - if not os.path.exists(file_name): - raise FileNotFoundError( - "GLUE datasets not found. 
For more details on how to get the data, see: " - "https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e" - ) - - dataset = GLUEDataset( - file_name=file_name, - task_name=self.task_name, - tokenizer=self.tokenizer, - max_seq_length=self._cfg.dataset.max_seq_length, - use_cache=self._cfg.dataset.use_cache, - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=self._cfg.dataset.num_workers, - pin_memory=self._cfg.dataset.pin_memory, - drop_last=self._cfg.dataset.drop_last, - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass diff --git a/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py b/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py deleted file mode 100644 index dd4ecada1a87..000000000000 --- a/nemo/collections/nlp/models/glue_benchmark/metrics_for_glue.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List - -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import f1_score, matthews_corrcoef - -__all__ = ['compute_metrics'] - - -def accuracy(preds: List[int], labels: List[int]): - return {"acc": (preds == labels).mean()} - - -def acc_and_f1(preds: List[int], labels: List[int]): - accuracy = (preds == labels).mean() - f1 = f1_score(y_true=labels, y_pred=preds) - return {"acc": accuracy, "f1": f1} - - -def mcc(preds: List[int], labels: List[int]): - return {"mcc": matthews_corrcoef(labels, preds)} - - -def pearson_and_spearman(preds: List[int], labels: List[int]): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return {"pearson": pearson_corr, "spearmanr": spearman_corr, "pear+spear av": (pearson_corr + spearman_corr) / 2} - - -def compute_metrics(task_name: str, preds: List[int], labels: List[int]) -> Dict[str, float]: - """ - Computes metrics for GLUE tasks - Args: - task_name: GLUE task name - preds: model predictions - labels: golden labels - Returns: - metrics - """ - if len(preds) != len(labels): - raise ValueError("Predictions and labels must have the same length") - - metric_fn = accuracy - if task_name == 'cola': - metric_fn = mcc - elif task_name in ['mrpc', 'qqp']: - metric_fn = acc_and_f1 - elif task_name == 'sts-b': - metric_fn = pearson_and_spearman - - return metric_fn(preds, labels) diff --git a/nemo/collections/nlp/models/information_retrieval/__init__.py b/nemo/collections/nlp/models/information_retrieval/__init__.py deleted file mode 100644 index f07a53c76cb2..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
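The `compute_metrics` helper above simply dispatches to task-specific metric functions. As a quick illustration (not part of the deleted files; the toy predictions and labels are made up), the same scipy/scikit-learn calls can be exercised standalone:

```python
# Toy illustration of the per-task metric dispatch in metrics_for_glue.py.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef

preds = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])

print({"mcc": matthews_corrcoef(labels, preds)})              # 'cola'
print({"acc": (preds == labels).mean(),
       "f1": f1_score(y_true=labels, y_pred=preds)})          # 'mrpc', 'qqp'
print({"acc": (preds == labels).mean()})                      # all other classification tasks

scores_pred = np.array([0.1, 0.4, 0.35, 0.8])                 # 'sts-b' compares continuous scores
scores_gold = np.array([0.0, 0.5, 0.30, 0.9])
pearson = pearsonr(scores_pred, scores_gold)[0]
spearman = spearmanr(scores_pred, scores_gold)[0]
print({"pearson": pearson, "spearmanr": spearman, "pear+spear av": (pearson + spearman) / 2})
```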
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from nemo.collections.nlp.models.information_retrieval.bert_dpr_model import BertDPRModel
-from nemo.collections.nlp.models.information_retrieval.bert_joint_ir_model import BertJointIRModel
diff --git a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py b/nemo/collections/nlp/models/information_retrieval/base_ir_model.py
deleted file mode 100644
index 91d86fef1851..000000000000
--- a/nemo/collections/nlp/models/information_retrieval/base_ir_model.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Dict, Optional
-
-import numpy as np
-import torch
-from lightning.pytorch import Trainer
-from omegaconf import DictConfig, OmegaConf
-
-from nemo.collections.nlp.data import BertInformationRetrievalDataset
-from nemo.collections.nlp.models.nlp_model import NLPModel
-from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
-from nemo.core.classes.common import typecheck
-
-__all__ = ['BaseIRModel']
-
-
-class BaseIRModel(NLPModel):
-    """
-    Base class for information retrieval models.
-    """
-
-    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
-
-        self.setup_tokenizer(cfg.tokenizer)
-
-        super().__init__(cfg=cfg, trainer=trainer)
-
-    @typecheck()
-    def forward(self, *args):
-        pass
-
-    def compute_scores_and_loss(self, inputs):
-        pass
-
-    @staticmethod
-    def get_lm_model_with_padded_embedding(cfg: DictConfig):
-        """
-        Function which ensures that vocabulary size is divisible by 8
-        for faster mixed precision training.
-        """
-        model = get_lm_model(
-            config_file=cfg.language_model.config_file,
-            config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
-            vocab_file=cfg.tokenizer.vocab_file,
-            trainer=None,  # no trainer object is in scope inside this static helper
-            cfg=cfg,
-        )
-        vocab_size, hidden_size = model.config.vocab_size, model.config.hidden_size
-        tokens_to_add = 8 * math.ceil(vocab_size / 8) - vocab_size
-        zeros = torch.zeros((tokens_to_add, hidden_size))
-        model.embeddings.word_embeddings.weight.data = torch.cat((model.embeddings.word_embeddings.weight.data, zeros))
-        return model
-
-    @staticmethod
-    def calculate_mean_reciprocal_rank(query2passages, query2rel):
-        """
-        Helper function which calculates mean reciprocal rank.
- Args: - query2passages: dict which contains passage ids and corresponding - scores for each query - query2rel: dict which contains ids of relevant passages for each query - """ - reciprocal_ranks = [] - - for query in query2passages: - indices = np.argsort(query2passages[query]["scores"])[::-1] - sorted_psgs = query2passages[query]["psg_ids"][indices] - - reciprocal_ranks.append(0) - for i, psg_id in enumerate(sorted_psgs): - if psg_id in query2rel[query]: - reciprocal_ranks[-1] = 1 / (i + 1) - break - return np.mean(reciprocal_ranks) - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - scores, train_loss = self.compute_scores_and_loss(batch[:-2]) - tensorboard_logs = {"train_loss": train_loss, "lr": self._optimizer.param_groups[0]["lr"]} - return {"loss": train_loss, "log": tensorboard_logs} - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - scores, val_loss = self.compute_scores_and_loss(batch[:-2]) - query_ids, passage_ids = batch[-2:] - data_for_val = { - "val_loss": val_loss, - "scores": scores, - "query_ids": query_ids, - "passage_ids": passage_ids, - } - self.validation_step_outputs.append(data_for_val) - return data_for_val - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. - """ - - query_ids = torch.cat([x["query_ids"] for x in self.validation_step_outputs]) - passage_ids = torch.cat([x["passage_ids"] for x in self.validation_step_outputs]) - scores = torch.cat([x["scores"] for x in self.validation_step_outputs]) - - all_query_ids, all_passage_ids, all_scores = [], [], [] - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - for ind in range(world_size): - all_query_ids.append(torch.empty_like(query_ids)) - all_passage_ids.append(torch.empty_like(passage_ids)) - all_scores.append(torch.empty_like(scores)) - torch.distributed.all_gather(all_query_ids, query_ids) - torch.distributed.all_gather(all_passage_ids, passage_ids) - torch.distributed.all_gather(all_scores, scores) - else: - all_query_ids.append(query_ids) - all_passage_ids.append(passage_ids) - all_scores.append(scores) - - val_mrr = 0 - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - query2passages, query2rels = {}, {} - processed_queries = set() - - for i in range(len(all_query_ids)): - - query_ids = all_query_ids[i].detach().cpu().numpy() - passage_ids = all_passage_ids[i].detach().cpu().numpy() - scores = all_scores[i].detach().cpu().numpy() - - for j, query_id in enumerate(query_ids): - - if query_id not in processed_queries: - processed_queries.add(query_id) - query2passages[query_id] = { - "psg_ids": passage_ids[j], - "scores": scores[j], - } - query2rels[query_id] = [passage_ids[j][0]] - else: - query2passages[query_id]["psg_ids"] = np.concatenate( - (query2passages[query_id]["psg_ids"], passage_ids[j][1:]) - ) - query2passages[query_id]["scores"] = np.concatenate( - (query2passages[query_id]["scores"], scores[j][1:]) - ) - - val_mrr = self.calculate_mean_reciprocal_rank(query2passages, query2rels) - - val_loss = torch.stack([x["val_loss"] for x in self.validation_step_outputs]).mean() - self.validation_step_outputs.clear() # free memory - tensorboard_logs = { - "val_mrr": 
val_mrr, - "val_loss": val_loss, - } - - return {"log": tensorboard_logs} - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - - dataset = BertInformationRetrievalDataset( - tokenizer=self.tokenizer, - passages=cfg.passages, - queries=cfg.queries, - query_to_passages=cfg.query_to_passages, - num_negatives=cfg.num_negatives, - psg_cache_format=cfg.get("psg_cache_format", "pkl"), - max_query_length=cfg.get("max_query_length", 31), - max_passage_length=cfg.get("max_passage_length", 190), - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", False), - drop_last=cfg.get("drop_last", False), - ) - - @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass diff --git a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py b/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py deleted file mode 100644 index bfbec123d13e..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_dpr_model.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import SmoothedCrossEntropyLoss -from nemo.collections.nlp.data import BertInformationRetrievalDataset -from nemo.collections.nlp.models.information_retrieval.base_ir_model import BaseIRModel -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import ChannelType, LogitsType, MaskType, NeuralType - -__all__ = ["BertDPRModel"] - - -class BertDPRModel(BaseIRModel): - """ - Information retrieval model which encodes query and passage separately - with two different BERT encoders and computes their similarity score - as a dot-product between corresponding [CLS] token representations. 
- """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "q_input_ids": NeuralType(("B", "T"), ChannelType()), - "q_attention_mask": NeuralType(("B", "T"), MaskType()), - "q_token_type_ids": NeuralType(("B", "T"), ChannelType()), - "p_input_ids": NeuralType(("B", "T"), ChannelType()), - "p_attention_mask": NeuralType(("B", "T"), MaskType()), - "p_token_type_ids": NeuralType(("B", "T"), ChannelType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"logits": NeuralType(("B", "D"), LogitsType())} - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - model_name = cfg.language_model.pretrained_model_name - self.tokenizer = get_tokenizer(tokenizer_name=model_name) - - super().__init__(cfg=cfg, trainer=trainer) - - self.q_encoder = self.get_lm_model_with_padded_embedding(cfg) - self.p_encoder = self.get_lm_model_with_padded_embedding(cfg) - self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id) - - @typecheck() - def forward( - self, - q_input_ids, - q_token_type_ids, - q_attention_mask, - p_input_ids, - p_token_type_ids, - p_attention_mask, - ): - - q_vectors = self.q_encoder( - input_ids=q_input_ids, - token_type_ids=q_token_type_ids, - attention_mask=q_attention_mask, - ) - q_vectors = q_vectors[:, 0] - batch_size, hidden_size = q_vectors.size() - - p_vectors = self.p_encoder( - input_ids=p_input_ids, - token_type_ids=p_token_type_ids, - attention_mask=p_attention_mask, - ) - num_passages = p_vectors.shape[0] // batch_size - p_vectors = p_vectors[:, 0].view(-1, num_passages, hidden_size) - p_positives, p_negatives = p_vectors[:, 0], p_vectors[:, 1:] - scores = torch.cat( - ( - torch.matmul(q_vectors, p_positives.T), - torch.einsum("ij,ipj->ip", q_vectors, p_negatives), - ), - dim=1, - ) - - return scores - - def compute_scores_and_loss(self, inputs): - ( - q_input_ids, - q_input_mask, - q_input_type_ids, - p_input_ids, - p_input_mask, - p_input_type_ids, - ) = inputs - batch_size, num_passages, p_seq_length = p_input_ids.size() - q_seq_length = q_input_ids.size()[-1] - - scores = self( - q_input_ids=q_input_ids.view(-1, q_seq_length), - q_token_type_ids=q_input_type_ids.view(-1, q_seq_length), - q_attention_mask=q_input_mask.view(-1, q_seq_length), - p_input_ids=p_input_ids.view(-1, p_seq_length), - p_token_type_ids=p_input_type_ids.view(-1, p_seq_length), - p_attention_mask=p_input_mask.view(-1, p_seq_length), - ).view(batch_size, 1, batch_size + num_passages - 1) - normalized_scores = torch.log_softmax(scores, dim=-1) - - labels = torch.arange(batch_size)[:, None].long().to(normalized_scores.device) - loss = self.loss( - log_probs=normalized_scores, - labels=labels, - output_mask=torch.ones_like(labels), - ) - - scores = scores[:, 0] - scores = torch.cat( - (torch.diag(scores)[:, None], scores[:, batch_size:]), - dim=1, - ) - - return scores, loss - - def _setup_dataloader_from_config(self, cfg: DictConfig): - - dataset = BertInformationRetrievalDataset( - tokenizer=self.tokenizer, - passages=cfg.passages, - queries=cfg.queries, - query_to_passages=cfg.query_to_passages, - num_negatives=cfg.num_negatives, - psg_cache_format=cfg.get("psg_cache_format", "pkl"), - max_query_length=cfg.get("max_query_length", 31), - max_passage_length=cfg.get("max_passage_length", 190), - preprocess_fn="preprocess_dpr", - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.get("num_workers", 2), - pin_memory=cfg.get("pin_memory", 
False), - drop_last=cfg.get("drop_last", False), - ) diff --git a/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py deleted file mode 100644 index 2f0445d4c184..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_embedding_model.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings - -import torch -import torch.nn.functional as F -from torch import Tensor, nn - -from nemo.collections.nlp.models.language_modeling.megatron.bert.bert_model import ( - MCoreBertModelWrapperWithPostLNSupport, - NeMoBertModel, -) -from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - -try: - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - ModelParallelConfig = ApexGuardDefaults - - HAVE_MEGATRON_CORE = False - - -class BertEmbeddingHead(nn.Module): - """Performs mean pooling on the token embeddings.""" - - def __init__( - self, - word_embedding_dimension: int, - pooling_mode_mean_tokens: bool = True, - ): - super(BertEmbeddingHead, self).__init__() - - self.config_keys = [ - "word_embedding_dimension", - "pooling_mode_mean_tokens", - ] - self.word_embedding_dimension = word_embedding_dimension - self.pooling_mode_mean_tokens = pooling_mode_mean_tokens - - def forward(self, token_embeddings: Tensor, attention_mask: Tensor): - # pylint: disable=C0116 - token_embeddings = token_embeddings.permute(1, 0, 2) - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - - sum_mask = input_mask_expanded.sum(1) - - sum_mask = torch.clamp(sum_mask, min=1e-9) - - output_vector = sum_embeddings / sum_mask - - output_vector = F.normalize(output_vector, p=2, dim=1) - - return output_vector - - def __repr__(self): - return "Pooling({}) and Normalize".format(self.get_config_dict()) - - def get_config_dict(self): - # pylint: disable=C0116 - return {key: self.__dict__[key] for key in self.config_keys} - - -class MCoreBertEmbeddingModel(MCoreBertModelWrapperWithPostLNSupport): - """BertEmbeddingModel that wraps a BertEmbeddingHead and MCoreBertEmbeddingModel""" - - def __init__(self, *args, **kwargs): - - super(MCoreBertEmbeddingModel, self).__init__(*args, **kwargs) - # Changing the default settings of the original Bert model to make it compatible with the embedding model. 
- self.post_process = False - self.binary_head = None - self.lm_head = None - self.output_layer = None - self.encoder.final_layernorm = None - self.encoder.post_process = False - self.embedding_head = BertEmbeddingHead( - word_embedding_dimension=self.config.hidden_size, - pooling_mode_mean_tokens=True, - ) - - def forward( - self, - input_ids: Tensor, - attention_mask: Tensor, - tokentype_ids: Tensor = None, - lm_labels: Tensor = None, - inference_params=None, - **kwargs, - ): - """Forward function of BERT model - - Forward function of the BERT Model This function passes the input tensors - through the embedding layer, and then the encoder and finally into the post - processing layer (optional). - - It either returns the Loss values if labels are given or the final hidden units - """ - hidden_states = super(MCoreBertEmbeddingModel, self).forward( - input_ids, attention_mask, tokentype_ids, lm_labels, inference_params - ) - embeddings_out = self.embedding_head(hidden_states, attention_mask) - return embeddings_out - - -class NeMoBertEmbeddingModel(NeMoBertModel): - """ - Bert Language model. - Model returns [seq, batch, hidden] shape - """ - - def __init__(self, *args, **kwargs): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. Use MCoreBertEmbeddingModel instead.", DeprecationWarning - ) - super().__init__(*args, **kwargs) - self.embedding_head = BertEmbeddingHead( - word_embedding_dimension=self.config.hidden_size, - pooling_mode_mean_tokens=True, - ) - - def forward( - self, - bert_model_input, - attention_mask, - token_type_ids=None, - lm_labels=None, - checkpoint_activations_all_layers=None, - ): - - lm_output = super(NeMoBertEmbeddingModel, self).forward( - bert_model_input, attention_mask, token_type_ids, lm_labels, checkpoint_activations_all_layers - ) - embeddings_out = self.embedding_head(lm_output[0], attention_mask) - return embeddings_out diff --git a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py b/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py deleted file mode 100644 index 33885e6b50c6..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
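For reference, the masked mean pooling and L2 normalization performed by `BertEmbeddingHead` above can be reproduced in isolation as follows; this is an illustrative sketch with made-up tensor sizes, not code from the deleted files:

```python
# Masked mean pooling over tokens followed by L2 normalization, mirroring BertEmbeddingHead.forward.
import torch
import torch.nn.functional as F

seq_len, batch_size, hidden = 8, 2, 16                       # made-up sizes
token_embeddings = torch.randn(seq_len, batch_size, hidden)  # encoder output in [seq, batch, hidden]
attention_mask = torch.ones(batch_size, seq_len)
attention_mask[1, 5:] = 0                                    # pretend the second sample is padded

token_embeddings = token_embeddings.permute(1, 0, 2)         # -> [batch, seq, hidden]
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

summed = (token_embeddings * mask).sum(dim=1)                # sum of non-padded token vectors
counts = mask.sum(dim=1).clamp(min=1e-9)                     # number of non-padded tokens
embeddings = F.normalize(summed / counts, p=2, dim=1)        # unit-length sentence embeddings

print(embeddings.shape)        # torch.Size([2, 16])
print(embeddings.norm(dim=1))  # ~1.0 for each row
```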
- -from typing import Dict, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig - -from nemo.collections.common.losses import SmoothedCrossEntropyLoss -from nemo.collections.nlp.models.information_retrieval.base_ir_model import BaseIRModel -from nemo.collections.nlp.modules.common import SequenceRegression -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import NeuralType - -__all__ = ["BertJointIRModel"] - - -class BertJointIRModel(BaseIRModel): - """ - Information retrieval model which jointly encodes both query and passage - and passes them to BERT encoder followed by a fully-connected layer for - similarity score prediction. - """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return self.bert_model.input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return self.sim_score_regressor.output_types - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - model_name = cfg.language_model.pretrained_model_name - self.tokenizer = get_tokenizer(tokenizer_name=model_name) - - super().__init__(cfg=cfg, trainer=trainer) - - self.bert_model = self.get_lm_model_with_padded_embedding(cfg) - hidden_size = self.bert_model.config.hidden_size - self.sim_score_regressor = SequenceRegression( - hidden_size=hidden_size, - num_layers=1, - dropout=cfg.language_model.sim_score_dropout, - ) - self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id) - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - - hidden_states = self.bert_model( - input_ids=input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - scores = self.sim_score_regressor(hidden_states=hidden_states) - - return scores - - def compute_scores_and_loss(self, inputs): - input_ids, input_mask, input_type_ids = inputs - batch_size, num_passages, seq_length = input_ids.size() - - unnormalized_scores = self( - input_ids=input_ids.view(-1, seq_length), - attention_mask=input_mask.view(-1, seq_length), - token_type_ids=input_type_ids.view(-1, seq_length), - ).view(batch_size, 1, num_passages) - scores = torch.log_softmax(unnormalized_scores, dim=-1) - - labels = torch.zeros_like(input_ids[:, :1, 0]) - loss = self.loss(log_probs=scores, labels=labels, output_mask=torch.ones_like(labels)) - - return unnormalized_scores[:, 0], loss diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py deleted file mode 100644 index 4f47c3a67216..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ /dev/null @@ -1,724 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
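`BertJointIRModel` above trains with a listwise objective: each query's candidate passages are scored jointly with the query, a softmax is taken over the candidates, and the relevant passage is assumed to sit at index 0. A minimal sketch of that objective, with random scores standing in for the `sim_score_regressor` output and plain `torch.nn.functional.cross_entropy` substituted for NeMo's `SmoothedCrossEntropyLoss`:

```python
# Listwise ranking objective: maximize the softmax probability of candidate 0 (the relevant passage).
import torch
import torch.nn.functional as F

batch_size, num_passages = 4, 8                              # made-up sizes
unnormalized_scores = torch.randn(batch_size, num_passages)  # stand-in for the regressor scores

labels = torch.zeros(batch_size, dtype=torch.long)           # positive passage is candidate 0 by convention
loss = F.cross_entropy(unnormalized_scores, labels)          # == -log softmax(scores)[:, 0], averaged over queries
print(loss)
```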
-# -# flake8: noqa -# pylint: skip-file - -import logging -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec -from omegaconf import DictConfig, OmegaConf, open_dict -from omegaconf.dictconfig import DictConfig -from torch.distributed import all_gather as all_gather_no_backprop -from torch.distributed.nn.functional import all_gather as all_gather_with_backprop - -from nemo.collections.nlp.data.information_retrieval.bert_embedding_dataset import BertEmbeddingDataset -from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( - MegatronPretrainingRandomSampler, - MegatronPretrainingSampler, -) -from nemo.collections.nlp.models.information_retrieval.bert_embedding_model import ( - MCoreBertEmbeddingModel, - NeMoBertEmbeddingModel, -) -from nemo.collections.nlp.models.language_modeling.megatron.bert.bert_spec import ( - bert_layer_with_transformer_engine_spec_postln, -) -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.modules.common.megatron.utils import ( - ApexGuardDefaults, - average_losses_across_data_parallel_group, -) -from nemo.collections.nlp.parts.utils_funcs import get_last_rank -from nemo.utils import logging - -try: - from megatron.core import parallel_state - from megatron.core.pipeline_parallel.schedules import get_forward_backward_func - from megatron.core.transformer.module import Float16Module as MCoreFloat16Module - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - TransformerConfig = ApexGuardDefaults - ModelParallelConfig = ApexGuardDefaults - - HAVE_MEGATRON_CORE = False - -try: - from megatron.core.num_microbatches_calculator import get_num_microbatches - -except (ImportError, ModuleNotFoundError): - logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") - from apex.transformer.pipeline_parallel.utils import get_num_microbatches - - -def listify(tensor): - l_tensor = [] - for t in tensor: - r = t[:].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -class MegatronBertEmbeddingModel(MegatronBertModel): - """ - Megatron Bert pretraining. 
- Model returns [batch, seq, hidden] shape - """ - - def __init__(self, cfg: DictConfig, trainer: Trainer): - - super().__init__(cfg, trainer=trainer) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss(label_smoothing=cfg.get('label_smoothing', 0.0)) - softmax_temp = cfg.get('softmax_temp', 0.05) - self.scale = 1.0 / softmax_temp - self.hard_negatives_to_train = self.cfg.data.get("hard_negatives_to_train", 4) - self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) - self.backprop_type = self.cfg.get("backprop_type", "local") - assert self.backprop_type in ["local", "global"], "Backprop type must be `local` or `global`" - - def model_provider_func(self, pre_process, post_process): - cfg = self.cfg - num_tokentypes = 2 if cfg.bert_binary_head else 0 - transformer_block_type = cfg.get('transformer_block_type', 'post_ln') - if self.mcore_bert: - if transformer_block_type == 'pre_ln': - layer_spec = bert_layer_with_transformer_engine_spec - else: - layer_spec = bert_layer_with_transformer_engine_spec_postln - model = MCoreBertEmbeddingModel( - config=self.transformer_config, - transformer_layer_spec=layer_spec, - vocab_size=self.padded_vocab_size, - max_sequence_length=cfg.max_position_embeddings, - num_tokentypes=num_tokentypes, - add_binary_head=cfg.bert_binary_head, - share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - transformer_block_type=transformer_block_type, - add_pooler=self.cfg.get('add_pooler', True), - ) - - else: - model = NeMoBertEmbeddingModel( - config=self.model_parallel_config, - vocab_size=self.padded_vocab_size, - hidden_size=cfg.hidden_size, - max_position_embeddings=cfg.max_position_embeddings, - num_layers=cfg.num_layers, - num_attention_heads=cfg.num_attention_heads, - apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True), - kv_channels=cfg.get('kv_channels', None), - ffn_hidden_size=cfg.ffn_hidden_size, - num_tokentypes=num_tokentypes, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - init_method_std=cfg.get('init_method_std', 0.02), - fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False), - hidden_dropout=cfg.get('hidden_dropout', 0.1), - precision=cfg.get('precision', 16), - fp32_residual_connection=cfg.get('fp32_residual_connection', False), - activations_checkpoint_granularity=self.cfg.get('activations_checkpoint_granularity', None), - activations_checkpoint_method=self.cfg.get('activations_checkpoint_method', None), - activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1), - activations_checkpoint_layers_per_pipeline=self.cfg.get( - 'activations_checkpoint_layers_per_pipeline', None - ), - layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5), - masked_softmax_fusion=cfg.get('masked_softmax_fusion', True), - normalization=cfg.get('normalization', 'layernorm'), - transformer_block_type=transformer_block_type, - bias_gelu_fusion=cfg.get('bias_gelu_fusion', True), - bias_dropout_add_fusion=cfg.get("bias_dropout_add_fusion", True), - onnx_safe=cfg.get('onnx_safe', False), - add_binary_head=cfg.bert_binary_head, - megatron_legacy=cfg.get('megatron_legacy', False), - position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"), - add_pooler=cfg.get('add_pooler', True), - add_lm_head=cfg.get('add_lm_head', False), - ) - - return model - - def build_train_valid_test_datasets(self, is_train=True): - - 
self._train_ds = None - self._validation_ds = None - self._test_ds = None - - if is_train: - self._train_ds = BertEmbeddingDataset( - self.cfg.data.data_train, - tokenizer=self.tokenizer, - add_bos=True, - num_hard_negatives=self.cfg.data.get("hard_negatives_to_train", 4), - max_seq_length=self.cfg.encoder_seq_length, - ) - if self.cfg.data.data_validation: - self._validation_ds = BertEmbeddingDataset( - self.cfg.data.data_validation, - tokenizer=self.tokenizer, - add_bos=True, - num_hard_negatives=self.cfg.data.get("hard_negatives_to_train", 4), - max_seq_length=self.cfg.encoder_seq_length, - ) - - else: - logging.info(f'Building test dataset') - if self.cfg.data.data_test.query_file_names is None or self.cfg.data.data_test.doc_file_names is None: - return [] - - query_dataset = BertEmbeddingDataset( - file_path=self.cfg.data.data_test.query_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=self.cfg.encoder_seq_length, - add_bos=True, - add_eos=True, - data_type="query", - ) - doc_dataset = BertEmbeddingDataset( - file_path=self.cfg.data.data_test.doc_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=self.cfg.encoder_seq_length, - add_bos=True, - add_eos=True, - data_type="doc", - ) - - self._test_ds = [query_dataset, doc_dataset] - - if self._train_ds is not None: - logging.info(f'Length of train dataset: {len(self._train_ds)}') - if self._validation_ds is not None: - logging.info(f'Length of val dataset: {len(self._validation_ds)}') - if self._test_ds is not None: - logging.info(f'Length of test query dataset: {len(self._test_ds[0])}') - logging.info(f'Length of test doc dataset: {len(self._test_ds[1])}') - - logging.info(f'Finished building SBert datasets.') - - return self._train_ds, self._validation_ds, self._test_ds - - def setup(self, stage=None): - """ - PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. - - Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. - """ - - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() - - logging.info( - f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' - f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' - f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' - f'Total number of model parameters: {total_num_parameters:.2e}.' - ) - - resume_checkpoint_path = self.trainer.ckpt_path - if resume_checkpoint_path: - init_consumed_samples = self._extract_consumed_samples_from_ckpt(resume_checkpoint_path) - else: - init_consumed_samples = 0 - self.init_consumed_samples = init_consumed_samples - self.init_global_step = self.trainer.global_step - - if stage == 'predict': - return - elif stage == 'test': - self.build_train_valid_test_datasets(is_train=False) - self.setup_test_data(self.cfg.data) - else: - # TODO: consider adding a ModelPT guard to check if model is being restored. 
- # allowing restored models to optionally setup datasets - if self.cfg.data.dataloader_type == "LDDL": - self.build_LDDL_data(self.cfg.data) - torch.distributed.barrier() - else: - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - - # when using pipeline model parallel the final stage need to initialize word embeddings - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if isinstance(self.model, list): - for i, module in enumerate(self.model): - sync_embeddings = ( - module.setup_embeddings_and_output_layer - if self.mcore_bert - else module.sync_initial_word_embeddings - ) - sync_embeddings() - else: - sync_embeddings = ( - self.model.setup_embeddings_and_output_layer - if self.mcore_bert - else self.model.sync_initial_word_embeddings - ) - sync_embeddings() - - if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_bert', False): - self.setup_transformer_engine_tp_groups() - - @classmethod - def merge_cfg_with(cls, path, cfg): - """ - Merge a given configuration dictionary `cfg` with the configuration dictionary - obtained from restoring a MegatronBertModel at the specified `path`. - - Args: - path (str): The path to the Bert model checkpoint to be restored. - cfg (DictConfig): The configuration dictionary to merge. - - Returns: - DictConfig: The merged configuration dictionary. - - Examples: - >>> path = "/path/to/model/checkpoint" - >>> cfg = DictConfig({"model": {"key": "value"}, "trainer": {"precision": 16}}) - >>> merged_cfg = merge_cfg_with(path, cfg) - - Notes: - - The function resolves variables within the `cfg` dictionary using `OmegaConf.resolve`. - - Keys in `cfg.model` will override the corresponding keys in the output dictionary. - - If "train_ds" exists in `cfg.model.data`, it updates `micro_batch_size` and `global_batch_size`. - - If `cfg.trainer` contains a "precision" key, it updates `output.precision`. 
- - """ - - base_cfg = cls.restore_from(path, return_config=True) - - OmegaConf.resolve(cfg) - with open_dict(base_cfg): - for key, val in cfg.model.items(): - base_cfg[key] = val - if "train_ds" in cfg.model.data: - base_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - base_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - if cfg.get("trainer", None) and cfg.trainer.get("precision"): - base_cfg.precision = cfg.trainer.precision - - return base_cfg - - def build_pretraining_data_loader(self, dataset, consumed_samples): - """Buld dataloader given an input dataset.""" - - if dataset is None: - return None - - # Megatron sampler - if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None: - if self.cfg.data.dataloader_type == 'single': - batch_sampler = MegatronPretrainingSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, - global_batch_size=self.cfg.global_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), - drop_last=self.cfg.get('drop_last', False), - pad_samples_to_global_batch_size=not self.cfg.get('drop_last', False), - ) - elif self.cfg.data.dataloader_type == 'cyclic': - batch_sampler = MegatronPretrainingRandomSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), - drop_last=self.cfg.get('drop_last', False), - pad_samples_to_global_batch_size=not self.cfg.get('drop_last', False), - ) - else: - raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"') - else: - raise ValueError('cfg.data.dataloader_type not found. Must be "single" or "cyclic"') - - # Torch dataloader. 
- - dataloader = torch.utils.data.DataLoader( - dataset, - shuffle=False, - batch_sampler=batch_sampler, - num_workers=self.cfg.data.num_workers, - pin_memory=True, - persistent_workers=True if self.cfg.data.num_workers > 0 else False, - collate_fn=dataset.collate_fn, - ) - return dataloader - - def setup_training_data(self, cfg): - if self._train_ds: - consumed_samples = self.compute_consumed_samples(0) - logging.info( - f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' - ) - self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) - - def setup_validation_data(self, cfg): - if self._validation_ds: - consumed_samples = 0 - logging.info( - f'Setting up validation dataloader with len(len(self._validation_ds)): {len(self._validation_ds)} and consumed samples: {consumed_samples}' - ) - self._validation_dl = self.build_pretraining_data_loader(self._validation_ds, consumed_samples) - - def setup_eval_dataloader(self, datasets): - dataloaders = [] - for dataset in datasets: - eval_dl = self.build_pretraining_data_loader( - dataset=dataset, - consumed_samples=0, - ) - dataloaders.append(eval_dl) - return dataloaders - - def setup_test_data(self, cfg): - if self._test_ds: - logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds[0])}, {len(self._test_ds[1])}' - ) - self._test_dl = self.setup_eval_dataloader(self._test_ds) - return - - def training_step(self, dataloader_iter): - - self._optimizer.zero_grad() - - if self.with_distributed_adam: - # hack to enable overlapping param sync and forward compute - # note: the distributed optimizer monkey-patches each - # parameter's __getattribute__ function so that it can - # launch parameter all-gathers the first time the - # parameter is accessed after the optimizer step. However, - # PyTorch directly passes embedding parameters into a C++, - # bypassing this process. A quick-and-dirty hack is to - # manually interact with the parameter. 
- modules = self.model if isinstance(self.model, list) else [self.model] - for module in modules: - if isinstance(module, (Float16Module, MCoreFloat16Module)): - module = module.module - if not self.mcore_bert: - module = module.language_model - if hasattr(module, 'embedding'): - for param in module.embedding.parameters(): - param.data_ptr() - - if self.cfg.data.dataloader_type == "LDDL": - # this is of type bert dataset - seq_length = dataloader_iter.iterator.loaders.get_seqlen() - else: - seq_length = self.cfg.encoder_seq_length - - # run forward and backwards passes for an entire global batch - # we do this inside training_step to support pipeline parallelism - fwd_bwd_function = get_forward_backward_func() - - losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=self._make_data_iterator_list(dataloader_iter), - model=self.model, - num_microbatches=get_num_microbatches(), - forward_only=False, - seq_length=seq_length, - micro_batch_size=self.cfg.micro_batch_size, - ) - - if losses_reduced_per_micro_batch: - loss_tensors_list = [loss_reduced['loss'] for loss_reduced in losses_reduced_per_micro_batch] - loss_tensor = torch.vstack(loss_tensors_list) - loss_mean = loss_tensor.mean(axis=0) - else: - if self.cfg.bert_binary_head == True: - loss_mean = torch.tensor([0.0, 0.0, 0.0]).cuda() - else: - loss_mean = torch.tensor([0.0, 0.0]).cuda() - - # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced - if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): - self.allreduce_sequence_parallel_gradients() - - if self.with_distributed_adam: - # synchronize asynchronous grad reductions - # note: not necessary, but reduces performance degradation - # from multiple simultaneous NCCL calls - self._optimizer._finish_bucket_grad_sync() - elif self.megatron_amp_O2: - if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False): - # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - self._optimizer.allreduce_main_grads() - else: - # async grad allreduce is not currently implemented for O1/autocasting mixed precision training - # so we all-reduce gradients after the pipeline - self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) - - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - # when using pipeline parallelism the first and last stage must keep embeddings in sync - self.allreduce_first_last_embeddings() - - torch.distributed.broadcast(loss_mean, get_last_rank()) - - if self.torch_dtype == torch.float16: - loss_scale = self.trainer.precision_plugin.scaler._scale - if loss_scale is not None: - self.log('loss_scale', loss_scale, batch_size=1) - - self.log('reduced_train_loss', loss_mean[0], prog_bar=True, batch_size=1) - if len(loss_mean) > 2: - self.log('reduced_lm_train_loss', loss_mean[1], prog_bar=True, batch_size=1) - self.log('reduced_sop_train_loss', loss_mean[2], prog_bar=True, batch_size=1) - lr = self._optimizer.param_groups[0]['lr'] - self.log('lr', lr, batch_size=1) - self.log('global_step', self.trainer.global_step, prog_bar=True, batch_size=1) - self.log( - 'consumed_samples', - self._compute_consumed_samples_after_training_step(), - prog_bar=True, - batch_size=1, - ) - return loss_mean[0] - - def get_forward_output_and_loss_func(self): - def fwd_output_and_loss_func(dataloader_iter, model, 
checkpoint_activations_all_layers=None): - - batches, _, dl_idx = next(dataloader_iter) - metadata = batches.pop('metadata') - batches = {k: v.cuda(non_blocking=True) for k, v in batches.items()} - - if self.mcore_bert: - - batches["tokentype_ids"] = batches.pop("token_type_ids") - output_tensor = model(**batches) - else: - output_tensor = self.forward(**batches).permute(1, 0) - - def loss_func(output_tensor): - - loss_dict = self.loss_func(output_tensor) - - if 'sop loss' in loss_dict: - lm_loss = loss_dict['lm loss'] - sop_loss = loss_dict['sop loss'] - loss = lm_loss + sop_loss - reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss, sop_loss]) - else: - lm_loss = loss_dict['lm loss'] - loss = lm_loss - reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss]) - - if 'hs' in loss_dict: - # metadata = batches.get('metadata', [{}] * len(batches['input_ids'])) - return loss, { - 'loss': reduced_loss, - 'd_hs': loss_dict['hs'], - 'q_hs': loss_dict['hs'], - 'metadata': metadata, - 'dl_idx': dl_idx, - } - else: - return loss, {'loss': reduced_loss} - - return output_tensor, loss_func - - return fwd_output_and_loss_func - - def validation_step(self, dataloader_iter): - prefix = "test" if self.trainer.testing else "val" - if self.cfg.data.dataloader_type == "LDDL": - seq_length = dataloader_iter.iterator.get_seqlen() - else: - seq_length = self.cfg.encoder_seq_length - - fwd_bwd_function = get_forward_backward_func() - - losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=self._make_data_iterator_list(dataloader_iter), - model=self.model, - num_microbatches=get_num_microbatches(), - forward_only=True, - seq_length=seq_length, - micro_batch_size=self.cfg.micro_batch_size, - ) - - if losses_reduced_per_micro_batch: - loss_tensors_list = [loss_reduced['loss'] for loss_reduced in losses_reduced_per_micro_batch] - loss_tensor = torch.vstack(loss_tensors_list) - loss_mean = loss_tensor.mean(axis=0) - else: - loss_mean = torch.tensor([0.0]).cuda() - - loss = loss_mean[0] - if prefix == 'val': - self.validation_step_outputs.append(loss) - else: - assert len(losses_reduced_per_micro_batch) == 1 - dataloader_idx = losses_reduced_per_micro_batch[0]['dl_idx'] - self.test_step_outputs[dataloader_idx].append(losses_reduced_per_micro_batch[0]) - return loss - - def on_test_epoch_end(self): - for dataloader_idx, output in enumerate(self.test_step_outputs): - self.gather_and_maybe_write_predictions(output, self.cfg.data.data_test, 'test', dataloader_idx) - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'q_hs': batch['q_hs'], - 'd_hs': batch['d_hs'], - 'metadata': batch['metadata'], - } - for batch in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. 
- deduplicated_outputs = { - 'q_hs': [], - 'd_hs': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['q_hs']) - l_d_hs = listify(batch['d_hs']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) == len(l_d_hs) - for q_hs, d_hs, metadata in zip( - l_q_hs, - l_d_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['q_hs'].append(q_hs) - deduplicated_outputs['d_hs'].append(d_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - filename_log_key = f"{mode}_{data_cfg.names[dataloader_idx]}" - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - emb_type = 'query' if d_idx == 0 else 'doc' - hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/{emb_type}.ids", "w") as f: - for m in outputs['metadata']: - f.write(m[f"{emb_type}_id"] + "\n") - np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) - return True - - def inference_loss_func(self, eos_tensors): - hs = eos_tensors - _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] - return { - 'hs': eos_tensors, - 'lm loss': _blank, - } - - def _gather_global_inbatch_representations(self, local_tensor): - local_tensor = local_tensor.contiguous() - if self.backprop_type == 'local': - global_tensors = [ - torch.zeros_like(local_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) - ] - all_gather_no_backprop(global_tensors, local_tensor, group=parallel_state.get_data_parallel_group()) - global_tensors[parallel_state.get_data_parallel_rank()] = local_tensor - global_tensors = torch.cat(global_tensors, dim=0) - - else: - global_tensors = all_gather_with_backprop(local_tensor) - global_tensors = torch.cat(global_tensors, dim=0) - - return global_tensors - - def loss_func(self, output_tensor): - if self.global_inbatch_negatives and self.trainer.training: - output_tensor = self._gather_global_inbatch_representations(output_tensor) - if self.trainer.testing: - return self.inference_loss_func(output_tensor) - - num_tensors_per_example = 2 + self.hard_negatives_to_train - bs = output_tensor.shape[0] // num_tensors_per_example - chunks = output_tensor.chunk(bs) - queries = torch.stack([item[0] for item in chunks]) # shape (bs, embedding_dim) - positives = 
torch.stack([item[1] for item in chunks]) # shape (bs, embedding_dim) - - pos_inbatch_negs_scores = torch.mm( - queries, positives.transpose(0, 1) - ) # shape (bs, bs); each positive is negative for other queries. - - hard_negs = [ - torch.stack([item[i + 2] for item in chunks]) for i in range(self.hard_negatives_to_train) - ] # List of length "num_negatives", each tensor of shape (bs, embedding_dim) - - hard_negs_scores = ( - torch.multiply( - queries.unsqueeze(0).repeat(len(hard_negs), 1, 1), - torch.stack(hard_negs), - ) - .sum(axis=-1) - .T - ) # shape = (bs, num_negatives); Hard negatives are not shared between queries. - - scores = torch.cat([pos_inbatch_negs_scores, hard_negs_scores], axis=1) - - scores = scores.clamp(-1.0, 1.0) - scores *= self.scale - - labels = torch.tensor( - range(len(scores)), dtype=torch.long, device=scores.device - ) # Indices of the (query, positive) pairs - - return {'lm loss': self.cross_entropy_loss(scores, labels)} diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py deleted file mode 100644 index b5240ec2e170..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ /dev/null @@ -1,475 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
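The embedding `loss_func` removed above scores each query against every in-batch positive (so other examples' positives act as negatives) and against its own hard negatives, then applies a scaled cross-entropy with the correct pairs on the diagonal. A minimal standalone sketch of that scoring, assuming pre-normalized embeddings; the function name, the default `scale`, and the random inputs in the usage lines are illustrative, not taken from the deleted code:

import torch
import torch.nn.functional as F

def contrastive_loss(queries, positives, hard_negs, scale=20.0):
    # queries, positives: (bs, dim); hard_negs: list of (bs, dim) tensors, one per hard negative.
    pos_inbatch_scores = queries @ positives.T  # (bs, bs); off-diagonal entries act as in-batch negatives
    if hard_negs:
        # Hard negatives are scored only against their own query, not shared across the batch.
        hard_neg_scores = torch.stack([(queries * hn).sum(dim=-1) for hn in hard_negs], dim=1)  # (bs, n_hard)
        scores = torch.cat([pos_inbatch_scores, hard_neg_scores], dim=1)
    else:
        scores = pos_inbatch_scores
    scores = scores.clamp(-1.0, 1.0) * scale  # clamp assumes cosine-like (normalized) inputs
    labels = torch.arange(queries.shape[0], device=queries.device)  # index of each (query, positive) pair
    return F.cross_entropy(scores, labels)

# Illustrative usage with random, normalized embeddings (bs=4, dim=8, 2 hard negatives per query):
q = F.normalize(torch.randn(4, 8), dim=1)
p = F.normalize(torch.randn(4, 8), dim=1)
negs = [F.normalize(torch.randn(4, 8), dim=1) for _ in range(2)]
loss = contrastive_loss(q, p, negs)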
- -import itertools -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import DictConfig, ListConfig - -from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTEmbeddingDataset -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -def listify(tensor): - l_tensor = [] - for t in tensor: - for rid in range(t.shape[0]): - r = t[rid, :].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -def _gather_global_inbatch_representations(local_eos_tensor): - local_eos_tensor = local_eos_tensor.contiguous() - global_eos_tensors = [ - torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) - ] - torch.distributed.all_gather(global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group()) - global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor - global_eos_tensors = torch.cat(global_eos_tensors, dim=0) - return global_eos_tensors - - -class MegatronGPTEmbeddingModel(MegatronGPTSFTModel): - def __init__(self, cfg: DictConfig, trainer: Trainer): - super().__init__(cfg, trainer=trainer) - self.temperature = self.cfg.get('temperature', 0.02) - self.use_all_possible_negatives = self.cfg.get("use_all_possible_negatives", True) - self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) - if self.cfg.get("do_mrl", False): - min_mrl = self.cfg.get("min_mrl_dim", int(np.log2(32))) - 1 - max_mrl = int(np.log2(self.cfg.hidden_size // 2)) - self.mrl_dims = [2**i for i in range(max_mrl, min_mrl, -1)] - else: - self.mrl_dims = [] - - assert ( - self.cfg.get("post_process", False) is False - ), "post_process must be False to get hidden states in the loss_func" - - def model_provider_func(self, pre_process, post_process): - # (@adithyare) We need post_process to be False to get hidden states in the loss_func - return super().model_provider_func(pre_process, post_process=False) - - def maybe_setup_test(self): - if ( - hasattr(self.cfg.data, 'test_ds') - and self.cfg.data.test_ds.get('doc_file_names', None) is not None - and self.cfg.data.test_ds.get('query_file_names', None) is not None - ): - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - return - - def maybe_build_test(self): - if ( - hasattr(self.cfg.data, 'test_ds') - and self.cfg.data.test_ds.get('doc_file_names', None) is not None - and self.cfg.data.test_ds.get('query_file_names', None) is not None - ): - logging.info('Building GPT Embedder test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. - self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - - def _build_dataset(self, data_cfg, is_train=True): - packed_sequence = data_cfg.get("packed_sequence", False) - - # Determine if we are using a single dataset or a list of datasets. 
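As a worked example of the `mrl_dims` schedule computed in `__init__` above (the Matryoshka prefix sizes over which `loss_func` later in this file adds extra contrastive terms on truncated embeddings), here is the arithmetic for an assumed `hidden_size` of 4096 with the default `min_mrl_dim`; the concrete hidden size is an assumption for illustration only:

import numpy as np

hidden_size = 4096                           # assumption for illustration only
min_mrl = int(np.log2(32)) - 1               # 5 - 1 = 4
max_mrl = int(np.log2(hidden_size // 2))     # log2(2048) = 11
mrl_dims = [2**i for i in range(max_mrl, min_mrl, -1)]
print(mrl_dims)                              # [2048, 1024, 512, 256, 128, 64, 32]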
- if is_train: - # Construct the data prefix list for `get_datasets_weights_and_num_samples()` - # that is of the format [weight1,file_name1,weight2,file_name2,...] - if data_cfg.concat_sampling_probabilities is None or not isinstance( - data_cfg.concat_sampling_probabilities, ListConfig - ): - raise ValueError( - ( - f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." - f"Found: {data_cfg.concat_sampling_probabilities}" - ) - ) - - if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): - raise ValueError( - ( - f"concat_sampling_probabilities must be of the same size as file_names.", - f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", - ) - ) - - data_prefix = [] - for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): - data_prefix.append(weight) - data_prefix.append(prefix) - - if self.trainer.max_steps is None or self.trainer.max_steps <= 0: - raise ValueError( - f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' - ) - num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] - _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) - num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) - else: - num_query_files = len(data_cfg.query_file_names) if data_cfg.query_file_names is not None else 0 - num_doc_files = len(data_cfg.doc_file_names) if data_cfg.doc_file_names is not None else 0 - num_query_samples_per_dataset = [[None]] * num_query_files - num_doc_samples_per_dataset = [[None]] * num_doc_files - - # Check dataset max_seq_legnth and max_position_embeddings size - if ( - self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] - and data_cfg.max_seq_length > self.cfg.max_position_embeddings - ): - logging.warning( - f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" - ) - data_cfg.max_seq_length = self.cfg.max_position_embeddings - - # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 - # When using sequence parallel, sequence will further be split by TP size - pad_seq_length_to_mult = ( - 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 - ) - if is_train: - datasets = [] - for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - dataset = GPTEmbeddingDataset( - file_path=file_path, - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=num_samples[0], - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - ) - datasets.append(dataset) - if packed_sequence: - raise NotImplementedError("Packed sequence is not supported for MegatronGPTEmbeddingModel") - - dataset = BlendableDataset( - datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend - ) - return dataset - else: - if data_cfg.query_file_names is None or data_cfg.doc_file_names is None: - return [] - - query_dataset = GPTEmbeddingDataset( - file_path=data_cfg.query_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=None, - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="query", - ) - doc_dataset = GPTEmbeddingDataset( - file_path=data_cfg.doc_file_names[0], - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=None, - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="doc", - ) - return [query_dataset, doc_dataset] - - def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): - loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) - avg_pos_cs = non_loss_tensors['avg_pos_cs'][0].item() - avg_neg_cs = non_loss_tensors['avg_neg_cs'][0].item() - diff_cs = non_loss_tensors['diff_cs'][0].item() - self.log("avg_pos_cs", avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - self.log("avg_neg_cs", avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - self.log("diff_cs", diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - return loss_mean - - def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): - metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) - outputs = { - 'loss': loss, - 'metadata': metadata, # [dict] - 'q_hs': non_loss_tensors['query_hs'], # [batch_size, hidden_size] - 'd_hs': non_loss_tensors['doc_hs'], # [batch_size, hidden_size] - } - return outputs - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, averaged_metric, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'q_hs': batch['q_hs'], - 'd_hs': batch['d_hs'], - 'metadata': batch['metadata'], - } - for batch in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'q_hs': [], - 'd_hs': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['q_hs']) - l_d_hs = listify(batch['d_hs']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) == len(l_d_hs) - for q_hs, d_hs, metadata in zip( - l_q_hs, - l_d_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['q_hs'].append(q_hs) - deduplicated_outputs['d_hs'].append(d_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - assert metric_name == "loss", "Only loss is supported for now." 
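The `write_embeddings_to_file` method below pairs a `.npy` matrix with a `.ids` file (one id per row) under `{output_file_path_prefix}/consumed_samples{N}/{log_key}/`, with dataloader 0 holding query embeddings and dataloader 1 holding document embeddings. A hedged sketch of reading those files back; the folder argument and helper name are assumptions, not part of the deleted code:

import numpy as np

def load_embeddings(folder, emb_type="query"):
    # emb_type is "query" for dataloader 0 and "doc" for dataloader 1.
    embs = np.load(f"{folder}/{emb_type}.npy")      # shape: (num_examples, hidden_size)
    with open(f"{folder}/{emb_type}.ids") as f:
        ids = [line.strip() for line in f]          # i-th id labels the i-th row of embs
    assert len(ids) == embs.shape[0]
    return ids, embs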
- # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() - # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() - # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() - # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - # (@adithyare) We are not using the log key to write the embeddings to file - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - emb_type = 'query' if d_idx == 0 else 'doc' - hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/{emb_type}.ids", "w") as f: - for m in outputs['metadata']: - f.write(m[f"{emb_type}_id"] + "\n") - np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) - return True - - def local_validation_step(self, dataloader_iter): - """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. - """ - # Check if iterator is exhausted - # dataloader_iter, done = self._val_iterator_done(dataloader_iter) - # if done: - # return - # Get the dataloader_idx when MegatronGPTSFTModel calls validation_step of MegatronGPTModel - next_item_dataloader = next(dataloader_iter) - if isinstance(next_item_dataloader, int): - dataloader_idx = next_item_dataloader - else: - dataloader_iter = itertools.chain([next_item_dataloader], dataloader_iter) - mode = 'test' if self.trainer.testing else 'val' - # Initialize userbuffer communicators. - if self.initialize_ub: - self.initialize_ub_func() - - if isinstance(self.model, list): - for model_module in self.model: - model_module.eval() - - if self.cfg.get('fp8', False): - first_val_step = self.prev_step_training and not self.training - self.prev_step_training = self.training - else: - first_val_step = None - - loss, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, True, first_val_step) - - if isinstance(self.model, list): - for model_module in self.model: - model_module.train() - - if mode == 'val': - # MegatronGPTSFTModel class supports multiple dataloaders and uses validation_step of MegatronGPTModel. 
- # Supporting that case with below lines - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(loss) - else: - self.validation_step_outputs.append(loss) - else: - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(loss) - else: - self.test_step_outputs.append(loss) - - return loss, non_loss_tensors - - def constrastive_scores(self, pos_doc_hs, neg_doc_hs, query_hs, bs, temperature, use_all_possible_negatives=False): - all_doc_hs = torch.cat([pos_doc_hs, neg_doc_hs], dim=0) # (2bs) x hidden_size - cs = torch.mm(query_hs, all_doc_hs.transpose(0, 1)) # (bs) x (2bs) - pos_cs = cs[:, :bs].diag() - neg_cs = cs[:, bs:].diag() - if use_all_possible_negatives: - labels = torch.arange(bs, device=cs.device).long() - else: - labels = torch.zeros(bs, device=cs.device).long() - cs = torch.cat([pos_cs.unsqueeze(1), neg_cs.unsqueeze(1)], dim=1) - pos_cs = pos_cs.clone().detach().mean() - neg_cs = neg_cs.clone().detach().mean() - cs = cs.clamp(-1.0, 1.0) - cs = cs / temperature - return cs, pos_cs, neg_cs, labels - - def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): - hs = eos_tensors - hs = torch.nn.functional.normalize(hs, dim=1) - _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] - return { - "loss": _blank, - "query_hs": hs, - "pos_doc_hs": hs, - "pos_cs": _blank, - "neg_cs": _blank, - "diff_cs": _blank, - } - - def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): - idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) - eos_tensors = output_tensor[loss_mask, idx, :] - if self.global_inbatch_negatives and self.trainer.training: - eos_tensors = _gather_global_inbatch_representations(eos_tensors) - if not self.trainer.training: - return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) - bs = eos_tensors.shape[0] // 3 - query_hs = eos_tensors[::3, :] # every third tensor is a query (bs x hidden_size) - pos_doc_hs = eos_tensors[1::3, :] # every third tensor is a positive doc (bs x hidden_size) - neg_doc_hs = eos_tensors[2::3, :] # every third tensor is a negative doc (bs x hidden_size) - - query_hs = torch.nn.functional.normalize(query_hs, dim=1) - pos_doc_hs = torch.nn.functional.normalize(pos_doc_hs, dim=1) - neg_doc_hs = torch.nn.functional.normalize(neg_doc_hs, dim=1) - - cs, pos_cs, neg_cs, labels = self.constrastive_scores( - pos_doc_hs, neg_doc_hs, query_hs, bs, self.temperature, self.use_all_possible_negatives - ) - loss = torch.nn.functional.cross_entropy(cs, labels) - if self.mrl_dims: - for dim in self.mrl_dims: - cs_dim, _, _, _ = self.constrastive_scores( - pos_doc_hs[:, :dim], - neg_doc_hs[:, :dim], - query_hs[:, :dim], - bs, - self.temperature, - self.use_all_possible_negatives, - ) - loss += torch.nn.functional.cross_entropy(cs_dim, labels) - - cp_size = self.cfg.get('context_parallel_size', 1) - if cp_size > 1: - torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) - query_hs = query_hs.clone().detach() - pos_doc_hs = pos_doc_hs.clone().detach() - diff_cs = pos_cs - neg_cs - return { - "loss": loss, - "query_hs": query_hs, - "pos_doc_hs": pos_doc_hs, - "pos_cs": pos_cs, - "neg_cs": neg_cs, - "diff_cs": diff_cs, - } diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py 
b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py deleted file mode 100644 index 2f7722abc663..000000000000 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -import os - -import numpy as np -import torch -from lightning.pytorch.trainer.trainer import Trainer -from omegaconf import DictConfig, ListConfig - -from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTRerankerDataset -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import ( - MegatronGPTEmbeddingModel, - _gather_global_inbatch_representations, -) -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -def listify(tensor): - l_tensor = [] - for t in tensor: - for rid in range(t.shape[0]): - r = t[rid, :].unsqueeze(0).cpu() - l_tensor.append(r) - return l_tensor - - -class MegatronGPTRerankerModel(MegatronGPTEmbeddingModel): - def __init__(self, cfg: DictConfig, trainer: Trainer): - self.reward_model_loss = cfg.get("reward_model_loss", False) - super().__init__(cfg, trainer=trainer) - - def model_provider_func(self, pre_process, post_process): - # (@adithyare) We need post_process to be False to get hidden states in the loss_func - return super().model_provider_func(pre_process, post_process=False) - - def maybe_setup_test(self): - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - return - - def maybe_build_test(self): - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - logging.info('Building GPT Reranker test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. - self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - - def _build_dataset(self, data_cfg, is_train=True): - packed_sequence = data_cfg.get("packed_sequence", False) - - # Determine if we are using a single dataset or a list of datasets. - if is_train: - # Construct the data prefix list for `get_datasets_weights_and_num_samples()` - # that is of the format [weight1,file_name1,weight2,file_name2,...] - if data_cfg.concat_sampling_probabilities is None or not isinstance( - data_cfg.concat_sampling_probabilities, ListConfig - ): - raise ValueError( - ( - f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." 
- f"Found: {data_cfg.concat_sampling_probabilities}" - ) - ) - - if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): - raise ValueError( - ( - f"concat_sampling_probabilities must be of the same size as file_names.", - f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", - ) - ) - - data_prefix = [] - for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): - data_prefix.append(weight) - data_prefix.append(prefix) - - if self.trainer.max_steps is None or self.trainer.max_steps <= 0: - raise ValueError( - f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' - ) - num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] - _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) - num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) - else: - num_train_samples_per_dataset = [[None]] * len(data_cfg.file_names) - - # Check dataset max_seq_legnth and max_position_embeddings size - if ( - self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] - and data_cfg.max_seq_length > self.cfg.max_position_embeddings - ): - logging.warning( - f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" - ) - data_cfg.max_seq_length = self.cfg.max_position_embeddings - - # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 - # When using sequence parallel, sequence will further be split by TP size - pad_seq_length_to_mult = ( - 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 - ) - pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1) - - datasets = [] - for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - dataset = GPTRerankerDataset( - file_path=file_path, - tokenizer=self.tokenizer, - max_seq_length=data_cfg.max_seq_length, - min_seq_length=data_cfg.min_seq_length, - add_bos=data_cfg.get('add_bos', False), - add_eos=data_cfg.get('add_eos', True), - max_num_samples=num_samples[0], - seed=data_cfg.get('seed', 1234), - index_mapping_dir=data_cfg.get('index_mapping_dir', None), - virtual_tokens=self.virtual_tokens, - memmap_workers=data_cfg.get( - 'memmap_workers', None - ), # used to set num. of workers to create the memmap index files - truncation_method=data_cfg.get( - 'truncation_method', 'right' - ), # used to choose truncation method. Options: ['random', 'left', 'right'] - special_tokens=self.cfg.data.get( - 'chat_prompt_tokens', None - ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - data_type="train" if is_train else "validation", - ) - datasets.append(dataset) - if is_train: - if packed_sequence: - num_train_samples_after_blend = sum(len(dataset) for dataset in datasets) - dataset = BlendableDataset( - datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend - ) - return dataset - else: - return datasets - - def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): - loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) - logit_diff = non_loss_tensors['logit_diff'][0].item() - self.log("logit_diff", logit_diff, prog_bar=True, rank_zero_only=True, batch_size=1) - return loss_mean - - def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): - metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) - outputs = { - 'loss': loss, - 'metadata': metadata, # [dict] - 'query_pos_doc_logit': non_loss_tensors['query_pos_doc_logit'], # [batch_size, hidden_size] - } - return outputs - - def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): - query_pos_doc_hs = eos_tensors - _blank = torch.zeros(1, device=query_pos_doc_hs.device, dtype=query_pos_doc_hs.dtype)[0] - return { - "loss": _blank, - "query_pos_doc_logit": query_pos_doc_hs, - "query_neg_doc_logit": _blank, - "logit_diff": _blank, - } - - def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): - idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) - eos_tensors = output_tensor[loss_mask, idx, :] # (bs x 1) - if self.global_inbatch_negatives and self.trainer.training: - eos_tensors = _gather_global_inbatch_representations(eos_tensors) - if not self.trainer.training: - return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) - bs = eos_tensors.shape[0] // 2 - query_pos_doc_hs = eos_tensors[::2, :] # every second tensor from idx 0 is a query w pos_doc (bs x 1) - query_neg_doc_hs = eos_tensors[1::2, :] # every second tensor from idx 1 is a query w negative doc (bs x 1) - - if self.reward_model_loss: - loss = -torch.nn.functional.logsigmoid(query_pos_doc_hs - query_neg_doc_hs).mean() - else: - cs = torch.cat([query_pos_doc_hs, query_neg_doc_hs], dim=1) # (bs x 2) - cs = cs / self.temperature - labels = torch.zeros(bs, device=cs.device).long() - loss = torch.nn.functional.cross_entropy(cs, labels) - - cp_size = self.cfg.get('context_parallel_size', 1) - if cp_size > 1: - torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) - query_pos_doc_hs = query_pos_doc_hs.clone().detach() - query_neg_doc_hs = query_neg_doc_hs.clone().detach() - logit_diffs = torch.mean(query_pos_doc_hs - query_neg_doc_hs) - return { - "loss": loss, - "query_pos_doc_logit": query_pos_doc_hs, - "query_neg_doc_logit": query_neg_doc_hs, - "logit_diff": logit_diffs, - } - - def gather_and_maybe_write_predictions(self, output, data_cfg, mode, averaged_metric, dataloader_idx=0): - if not data_cfg.get("write_embeddings_to_file", False): - return True - gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_output_batches, - [ - { - 'query_pos_doc_logit': batch['query_pos_doc_logit'], - 'metadata': batch['metadata'], - } - for batch 
in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'query_pos_doc_logit': [], - 'metadata': [], - } - total_size, skipped = 0, 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_output_batches[rank]: - l_q_hs = listify(batch['query_pos_doc_logit']) - l_m = batch['metadata'] - assert len(l_m) == len(l_q_hs) - for q_hs, metadata in zip( - l_q_hs, - l_m, - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['query_pos_doc_logit'].append(q_hs) - deduplicated_outputs['metadata'].append(metadata) - else: - skipped += 1 - - logging.info( - f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." - ) - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - assert metric_name == "loss", "Only loss is supported for now." - # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() - # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() - # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() - # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - # (@adithyare) We are not using the log key to write the embeddings to file - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - consumed_samples = self._compute_consumed_samples_after_training_step() - fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" - self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) - return deduplicated_outputs, total_size - - def write_embeddings_to_file(self, outputs, output_file_path, d_idx): - hs = torch.cat(outputs['query_pos_doc_logit'], dim=0) - hs_npy = hs.float().numpy() - emb_fldr = f"{output_file_path}" - os.makedirs(emb_fldr, exist_ok=True) - with open(f"{output_file_path}/logits.ids", "w") as f: - for m in outputs['metadata']: - f.write(f"{m['query_id'].strip()} {m['doc_id']}\n") - np.save(f"{emb_fldr}/logits.npy", hs_npy) - return True diff --git a/nemo/collections/nlp/models/intent_slot_classification/__init__.py b/nemo/collections/nlp/models/intent_slot_classification/__init__.py deleted file mode 100644 index 80f5f92bd80b..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.models.intent_slot_classification.intent_slot_classification_model import ( - IntentSlotClassificationModel, -) -from nemo.collections.nlp.models.intent_slot_classification.multi_label_intent_slot_classification_model import ( - MultiLabelIntentSlotClassificationModel, -) diff --git a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py deleted file mode 100644 index a49bc699ab24..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pathlib -from typing import Dict, List, Optional - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, CrossEntropyLoss -from nemo.collections.nlp.data.intent_slot_classification import ( - IntentSlotClassificationDataset, - IntentSlotDataDesc, - IntentSlotInferenceDataset, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport -from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes import typecheck -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging - - -class IntentSlotClassificationModel(NLPModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes BERT Joint Intent and Slot model.""" - self.max_seq_length = cfg.language_model.max_seq_length - # init superclass - # Check the presence of data_dir. - if not cfg.data_dir or not os.path.exists(cfg.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.data_dir, cfg.train_ds, cfg.validation_ds) - super().__init__(cfg=cfg, trainer=trainer) - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_defaults_data_desc(self, cfg): - """ - Method makes sure that cfg.data_desc params are set. - If not, set's them to "dummy" defaults. - """ - if not hasattr(cfg, "data_desc"): - OmegaConf.set_struct(cfg, False) - cfg.data_desc = {} - # Intents. 
- cfg.data_desc.intent_labels = " " - cfg.data_desc.intent_label_ids = {" ": 0} - cfg.data_desc.intent_weights = [1] - # Slots. - cfg.data_desc.slot_labels = " " - cfg.data_desc.slot_label_ids = {" ": 0} - cfg.data_desc.slot_weights = [1] - - cfg.data_desc.pad_label = "O" - OmegaConf.set_struct(cfg, True) - - def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. - cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - {'intent_labels_file': 'intent_labels.csv', 'slot_labels_file': 'slot_labels.csv'} - ) - - slot_labels_file = os.path.join(data_dir, pathlib.Path(cfg.class_labels.slot_labels_file).name) - intent_labels_file = os.path.join(data_dir, pathlib.Path(cfg.class_labels.intent_labels_file).name) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact('class_labels.intent_labels_file', intent_labels_file) - self.register_artifact('class_labels.slot_labels_file', slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """Saves label ids map to a file""" - with open(filename, 'w') as out: - labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) - out.write('\n'.join(labels)) - logging.info(f'Labels: {label_ids}') - logging.info(f'Labels mapping saved to : {out.name}') - - def _reconfigure_classifier(self): - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.head.fc_dropout, - num_layers=self.cfg.head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == 'weighted_loss': - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = CrossEntropyLoss(logits_ndim=2, weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = CrossEntropyLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight] - ) - - # setup to track metrics - self.intent_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - 
label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode='micro', - ) - - def update_data_dir_for_training(self, data_dir: str, train_ds, validation_ds) -> None: - """ - Update data directory and get data stats with Data Descriptor. - Also, reconfigures the classifier - to cope with data with e.g. different number of slots. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - # Update configuration with new data. - self._set_data_desc_to_cfg(self.cfg, data_dir, train_ds, validation_ds) - # Reconfigure the classifier for different settings (number of intents, slots etc.). - self._reconfigure_classifier() - - def update_data_dir_for_testing(self, data_dir) -> None: - """ - Update data directory. - - Args: - data_dir: path to data directory - """ - logging.info(f'Setting data_dir to {data_dir}.') - self.data_dir = data_dir - - @typecheck() - def forward(self, input_ids, attention_mask, token_type_ids): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - if isinstance(hidden_states, tuple): - hidden_states = hidden_states[0] - - intent_logits, slot_logits = self.classifier(hidden_states=hidden_states) - return intent_logits.float(), slot_logits.float() - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - train_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', train_loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': train_loss, - 'lr': lr, - } - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- """ - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, intent_labels, slot_labels = batch - intent_logits, slot_logits = self( - input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - # calculate accuracy metrics for intents and slot reporting - # intents - preds = torch.argmax(intent_logits, axis=-1) - self.intent_classification_report.update(preds, intent_labels) - # slots - subtokens_mask = subtokens_mask > 0.5 - preds = torch.argmax(slot_logits, axis=-1)[subtokens_mask] - slot_labels = slot_labels[subtokens_mask] - self.slot_classification_report.update(preds, slot_labels) - - loss = { - 'val_loss': val_loss, - 'intent_tp': self.intent_classification_report.tp, - 'intent_fn': self.intent_classification_report.fn, - 'intent_fp': self.intent_classification_report.fp, - 'slot_tp': self.slot_classification_report.tp, - 'slot_fn': self.slot_classification_report.fn, - 'slot_fp': self.slot_classification_report.fp, - } - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. - """ - prefix = "test" if self.trainer.testing else "val" - if prefix == "val": - outputs = self.validation_step_outputs - else: - outputs = self.test_step_outputs - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - - # calculate metrics and log classification report (separately for intents and slots) - intent_precision, intent_recall, intent_f1, intent_report = self.intent_classification_report.compute() - logging.info(f'Intent report: {intent_report}') - - slot_precision, slot_recall, slot_f1, slot_report = self.slot_classification_report.compute() - logging.info(f'Slot report: {slot_report}') - - self.log(f'{prefix}_loss', avg_loss) - self.log('intent_precision', intent_precision) - self.log('intent_recall', intent_recall) - self.log('intent_f1', intent_f1) - self.log('slot_precision', slot_precision) - self.log('slot_recall', slot_recall) - self.log('slot_f1', slot_f1) - - self.intent_classification_report.reset() - self.slot_classification_report.reset() - self.validation_step_outputs.clear() if prefix == 'val' else self.test_step_outputs.clear() - - return { - f'{prefix}_loss': avg_loss, - 'intent_precision': intent_precision, - 'intent_recall': intent_recall, - 'intent_f1': intent_f1, - 'slot_precision': slot_precision, - 'slot_recall': slot_recall, - 'slot_f1': slot_f1, - } - - def test_step(self, batch, batch_idx): - """ - Lightning calls this inside the test loop with the data from the test dataloader - passed in as `batch`. - """ - loss = self.validation_step(batch, batch_idx) - self.test_step_outputs.append(loss) - return loss - - def on_test_epoch_end(self): - """ - Called at the end of test to aggregate outputs. - :param outputs: list of individual outputs of each test step. 
- """ - return self.on_validation_epoch_end() - - def setup_training_data(self, train_data_config: Optional[DictConfig]): - self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) - - def setup_validation_data(self, val_data_config: Optional[DictConfig]): - self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) - - def setup_test_data(self, test_data_config: Optional[DictConfig]): - self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config) - - def _setup_dataloader_from_config(self, cfg: DictConfig): - input_file = f'{self.data_dir}/{cfg.prefix}.tsv' - slot_file = f'{self.data_dir}/{cfg.prefix}_slots.tsv' - - if not (os.path.exists(input_file) and os.path.exists(slot_file)): - raise FileNotFoundError( - f'{input_file} or {slot_file} not found. Please refer to the documentation for the right format \ - of Intents and Slots files.' - ) - - dataset = IntentSlotClassificationDataset( - input_file=input_file, - slot_file=slot_file, - tokenizer=self.tokenizer, - max_seq_length=self.max_seq_length, - num_samples=cfg.num_samples, - pad_label=self.cfg.data_desc.pad_label, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def _setup_infer_dataloader(self, queries: List[str], test_ds) -> 'torch.utils.data.DataLoader': - """ - Setup function for a infer data loader. - Args: - queries: text - batch_size: batch size to use during inference - Returns: - A pytorch DataLoader. - """ - - dataset = IntentSlotInferenceDataset( - tokenizer=self.tokenizer, queries=queries, max_seq_length=-1, do_lower_case=False - ) - - return torch.utils.data.DataLoader( - dataset=dataset, - collate_fn=dataset.collate_fn, - batch_size=test_ds.batch_size, - shuffle=test_ds.shuffle, - num_workers=test_ds.num_workers, - pin_memory=test_ds.pin_memory, - drop_last=test_ds.drop_last, - ) - - def predict_from_examples(self, queries: List[str], test_ds) -> List[List[str]]: - """ - Get prediction for the queries (intent and slots) - Args: - queries: text sequences - test_ds: Dataset configuration section. - Returns: - predicted_intents, predicted_slots: model intent and slot predictions - """ - predicted_intents = [] - predicted_slots = [] - mode = self.training - try: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - slot_labels = self.cfg.data_desc.slot_labels - - # Initialize tokenizer. - # if not hasattr(self, "tokenizer"): - # self._setup_tokenizer(self.cfg.tokenizer) - # Initialize modules. - # self._reconfigure_classifier() - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents and slots for these examples - # intents - intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1)) - - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intent_num in intent_preds: - if intent_num < len(intent_labels): - predicted_intents.append(intent_labels[int(intent_num)]) - else: - # should not happen - predicted_intents.append("Unknown Intent") - - # slots - slot_preds = torch.argmax(slot_logits, axis=-1) - - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - query_slots = '' - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + ' ' - else: - query_slots += 'Unknown_slot ' - predicted_slots.append(query_slots.strip()) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - model = PretrainedModelInfo( - pretrained_model_name="Joint_Intent_Slot_Assistant", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemonlpmodels/versions/1.0.0a5/files/Joint_Intent_Slot_Assistant.nemo", - description="This models is trained on this https://github.com/xliuhw/NLU-Evaluation-Data dataset which includes 64 various intents and 55 slots. Final Intent accuracy is about 87%, Slot accuracy is about 89%.", - ) - result.append(model) - return result diff --git a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py deleted file mode 100644 index 7a2bec1f2cc0..000000000000 --- a/nemo/collections/nlp/models/intent_slot_classification/multi_label_intent_slot_classification_model.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
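For reference, the decoding loop in the deleted `predict_from_examples` above reduces to roughly the following sketch; the helper name and arguments are illustrative, and the out-of-range "Unknown Intent"/"Unknown_slot" fallbacks are omitted for brevity:

import torch

def decode_predictions(intent_logits, slot_logits, subtokens_mask, intent_labels, slot_labels):
    # One intent per query: argmax over the intent logits.
    intents = [intent_labels[int(i)] for i in torch.argmax(intent_logits, dim=-1)]
    slots = []
    for slot_row, mask_row in zip(torch.argmax(slot_logits, dim=-1), subtokens_mask):
        # Keep one slot label per word: only positions where the first-subtoken mask is set.
        slots.append(" ".join(slot_labels[int(s)] for s, m in zip(slot_row, mask_row) if m == 1))
    return intents, slots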
- -import os -from typing import List, Optional, Tuple - -import numpy as np -import numpy.typing as npt -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import f1_score, precision_score, recall_score -from torch.utils.data import DataLoader - -from nemo.collections.common.losses import AggregatorLoss, BCEWithLogitsLoss, CrossEntropyLoss -from nemo.collections.nlp.data.intent_slot_classification import ( - MultiLabelIntentSlotClassificationDataset, - MultiLabelIntentSlotDataDesc, -) -from nemo.collections.nlp.metrics.classification_report import ClassificationReport, MultiLabelClassificationReport -from nemo.collections.nlp.models.intent_slot_classification import IntentSlotClassificationModel -from nemo.collections.nlp.modules.common import SequenceTokenClassifier -from nemo.collections.nlp.parts.utils_funcs import tensor2list -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging - - -class MultiLabelIntentSlotClassificationModel(IntentSlotClassificationModel): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ - Initializes BERT Joint Intent and Slot model. - - Args: - cfg: configuration object - trainer: trainer for Pytorch Lightning - """ - self.max_seq_length = cfg.language_model.max_seq_length - - # Optimal Threshold - self.threshold = 0.5 - self.max_f1 = 0 - - # Check the presence of data_dir. - if not cfg.data_dir or not os.path.exists(cfg.data_dir): - # Set default values of data_desc. - self._set_defaults_data_desc(cfg) - else: - self.data_dir = cfg.data_dir - # Update configuration of data_desc. - self._set_data_desc_to_cfg(cfg, cfg.data_dir, cfg.train_ds, cfg.validation_ds) - - # init superclass - super().__init__(cfg=cfg, trainer=trainer) - - # Initialize Classifier. - self._reconfigure_classifier() - - def _set_data_desc_to_cfg( - self, cfg: DictConfig, data_dir: str, train_ds: DictConfig, validation_ds: DictConfig - ) -> None: - """ - Creates MultiLabelIntentSlotDataDesc and copies generated values to Configuration object's data descriptor. - - Args: - cfg: configuration object - data_dir: data directory - train_ds: training dataset file name - validation_ds: validation dataset file name - - Returns: - None - """ - # Save data from data desc to config - so it can be reused later, e.g. in inference. - data_desc = MultiLabelIntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) - OmegaConf.set_struct(cfg, False) - if not hasattr(cfg, "data_desc") or cfg.data_desc is None: - cfg.data_desc = {} - # Intents. - cfg.data_desc.intent_labels = list(data_desc.intents_label_ids.keys()) - cfg.data_desc.intent_label_ids = data_desc.intents_label_ids - cfg.data_desc.intent_weights = data_desc.intent_weights - # Slots. 
- cfg.data_desc.slot_labels = list(data_desc.slots_label_ids.keys()) - cfg.data_desc.slot_label_ids = data_desc.slots_label_ids - cfg.data_desc.slot_weights = data_desc.slot_weights - - cfg.data_desc.pad_label = data_desc.pad_label - - # for older(pre - 1.0.0.b3) configs compatibility - if not hasattr(cfg, "class_labels") or cfg.class_labels is None: - cfg.class_labels = {} - cfg.class_labels = OmegaConf.create( - { - "intent_labels_file": "intent_labels.csv", - "slot_labels_file": "slot_labels.csv", - } - ) - - slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file) - intent_labels_file = os.path.join(data_dir, cfg.class_labels.intent_labels_file) - self._save_label_ids(data_desc.slots_label_ids, slot_labels_file) - self._save_label_ids(data_desc.intents_label_ids, intent_labels_file) - - self.register_artifact("class_labels.intent_labels_file", intent_labels_file) - self.register_artifact("class_labels.slot_labels_file", slot_labels_file) - OmegaConf.set_struct(cfg, True) - - def _reconfigure_classifier(self) -> None: - """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" - - self.classifier = SequenceTokenClassifier( - hidden_size=self.bert_model.config.hidden_size, - num_intents=len(self.cfg.data_desc.intent_labels), - num_slots=len(self.cfg.data_desc.slot_labels), - dropout=self.cfg.head.fc_dropout, - num_layers=self.cfg.head.num_output_layers, - log_softmax=False, - ) - - # define losses - if self.cfg.class_balancing == "weighted_loss": - # You may need to increase the number of epochs for convergence when using weighted_loss - self.intent_loss = BCEWithLogitsLoss(logits_ndim=2, pos_weight=self.cfg.data_desc.intent_weights) - self.slot_loss = CrossEntropyLoss(logits_ndim=3, weight=self.cfg.data_desc.slot_weights) - else: - self.intent_loss = BCEWithLogitsLoss(logits_ndim=2) - self.slot_loss = CrossEntropyLoss(logits_ndim=3) - - self.total_loss = AggregatorLoss( - num_inputs=2, - weights=[self.cfg.intent_loss_weight, 1.0 - self.cfg.intent_loss_weight], - ) - - # setup to track metrics - self.intent_classification_report = MultiLabelClassificationReport( - num_classes=len(self.cfg.data_desc.intent_labels), - label_ids=self.cfg.data_desc.intent_label_ids, - dist_sync_on_step=True, - mode="micro", - ) - self.slot_classification_report = ClassificationReport( - num_classes=len(self.cfg.data_desc.slot_labels), - label_ids=self.cfg.data_desc.slot_label_ids, - dist_sync_on_step=True, - mode="micro", - ) - - def validation_step(self, batch, batch_idx) -> None: - """ - Validation Loop. Pytorch Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- - Args: - batch: batches of data from DataLoader - batch_idx: batch idx from DataLoader - - Returns: - None - """ - ( - input_ids, - input_type_ids, - input_mask, - loss_mask, - subtokens_mask, - intent_labels, - slot_labels, - ) = batch - intent_logits, slot_logits = self( - input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask, - ) - - # calculate combined loss for intents and slots - intent_loss = self.intent_loss(logits=intent_logits, labels=intent_labels) - slot_loss = self.slot_loss(logits=slot_logits, labels=slot_labels, loss_mask=loss_mask) - val_loss = self.total_loss(loss_1=intent_loss, loss_2=slot_loss) - - intent_probabilities = torch.round(torch.sigmoid(intent_logits)) - - self.intent_classification_report.update(intent_probabilities, intent_labels) - # slots - subtokens_mask = subtokens_mask > 0.5 - preds = torch.argmax(slot_logits, axis=-1)[subtokens_mask] - slot_labels = slot_labels[subtokens_mask] - self.slot_classification_report.update(preds, slot_labels) - - loss = { - "val_loss": val_loss, - "intent_tp": self.intent_classification_report.tp, - "intent_fn": self.intent_classification_report.fn, - "intent_fp": self.intent_classification_report.fp, - "slot_tp": self.slot_classification_report.tp, - "slot_fn": self.slot_classification_report.fn, - "slot_fp": self.slot_classification_report.fp, - } - self.validation_step_outputs.append(loss) - return loss - - def _setup_dataloader_from_config(self, cfg: DictConfig) -> DataLoader: - """ - Creates the DataLoader from the configuration object - - Args: - cfg: configuration object - - Returns: - DataLoader for model's data - """ - - input_file = f"{self.data_dir}/{cfg.prefix}.tsv" - slot_file = f"{self.data_dir}/{cfg.prefix}_slots.tsv" - intent_dict_file = self.data_dir + "/dict.intents.csv" - - lines = open(intent_dict_file, "r").readlines() - lines = [line.strip() for line in lines if line.strip()] - num_intents = len(lines) - - if not (os.path.exists(input_file) and os.path.exists(slot_file)): - raise FileNotFoundError( - f"{input_file} or {slot_file} not found. Please refer to the documentation for the right format \ - of Intents and Slots files." - ) - - dataset = MultiLabelIntentSlotClassificationDataset( - input_file=input_file, - slot_file=slot_file, - num_intents=num_intents, - tokenizer=self.tokenizer, - max_seq_length=self.max_seq_length, - num_samples=cfg.num_samples, - pad_label=self.cfg.data_desc.pad_label, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - return DataLoader( - dataset=dataset, - batch_size=cfg.batch_size, - shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, - collate_fn=dataset.collate_fn, - ) - - def prediction_probabilities(self, queries: List[str], test_ds: DictConfig) -> npt.NDArray: - """ - Get prediction probabilities for the queries (intent and slots) - - Args: - queries: text sequences - test_ds: Dataset configuration section. - - Returns: - numpy array of intent probabilities - """ - - probabilities = [] - - mode = self.training - try: - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. 
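# Editor's note (hedged sketch): a small stand-alone illustration of the post-processing
# in validation_step above -- intents are multi-label, so the sigmoid outputs are rounded
# to a multi-hot vector, while slots are single-label per token (argmax) and only the
# positions flagged by subtokens_mask are scored. The random tensors are placeholders
# for real batches; shapes are illustrative.
import torch

intent_logits = torch.randn(2, 6)              # (batch, num_intents)
slot_logits = torch.randn(2, 12, 10)           # (batch, seq_len, num_slots)
subtokens_mask = torch.randint(0, 2, (2, 12))  # 1 marks the first sub-token of each word

intent_preds = torch.round(torch.sigmoid(intent_logits))     # multi-hot predictions
keep = subtokens_mask > 0.5
slot_preds = torch.argmax(slot_logits, dim=-1)[keep]          # flattened kept positions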
- infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents for these examples - probabilities.append(torch.sigmoid(intent_logits).detach().cpu().numpy()) - - probabilities = np.concatenate(probabilities) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return probabilities - - def optimize_threshold(self, test_ds: DictConfig, file_name: str) -> None: - """ - Set the optimal threshold of the model from performance on validation set. This threshold is used to round the - logits to 0 or 1. - - Args: - test_ds: location of test dataset - file_name: name of input file to retrieve validation set - - Returns: - None - """ - - input_file = f"{self.data_dir}/{file_name}.tsv" - - with open(input_file, "r") as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - dataset = list(input_lines) - - metrics_labels, sentences = [], [] - - for input_line in dataset: - sentence = input_line.strip().split("\t")[0] - sentences.append(sentence) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(len(self.cfg.data_desc.intent_labels))] - metrics_labels.append(parts) - - # Retrieve class probabilities for each sentence - intent_probabilities = self.prediction_probabilities(sentences, test_ds) - - metrics_dict = {} - # Find optimal logits rounding threshold for intents - for i in np.arange(0.5, 0.96, 0.01): - predictions = (intent_probabilities >= i).tolist() - precision = precision_score(metrics_labels, predictions, average='micro') - recall = recall_score(metrics_labels, predictions, average='micro') - f1 = f1_score(metrics_labels, predictions, average='micro') - metrics_dict[i] = [precision, recall, f1] - - max_precision = max(metrics_dict, key=lambda x: metrics_dict[x][0]) - max_recall = max(metrics_dict, key=lambda x: metrics_dict[x][1]) - max_f1_score = max(metrics_dict, key=lambda x: metrics_dict[x][2]) - - logging.info( - f'Best Threshold for F1-Score: {max_f1_score}, [Precision, Recall, F1-Score]: {metrics_dict[max_f1_score]}' - ) - logging.info( - f'Best Threshold for Precision: {max_precision}, [Precision, Recall, F1-Score]: {metrics_dict[max_precision]}' - ) - logging.info( - f'Best Threshold for Recall: {max_recall}, [Precision, Recall, F1-Score]: {metrics_dict[max_recall]}' - ) - - if metrics_dict[max_f1_score][2] > self.max_f1: - self.max_f1 = metrics_dict[max_f1_score][2] - - logging.info(f'Setting Threshold to: {max_f1_score}') - - self.threshold = max_f1_score - - def predict_from_examples( - self, queries: List[str], test_ds: DictConfig, threshold: float = None - ) -> Tuple[List[List[Tuple[str, float]]], List[str], List[List[int]]]: - """ - Get prediction for the queries (intent and slots) - - - Args: - queries: text sequences - test_ds: Dataset configuration section. 
- threshold: Threshold for rounding prediction logits - - Returns: - predicted_intents: model intent predictions with their probabilities - Example: [[('flight', 0.84)], [('airfare', 0.54), - ('flight', 0.73), ('meal', 0.24)]] - predicted_slots: model slot predictions - Example: ['O B-depart_date.month_name B-depart_date.day_number', - 'O O B-flight_stop O O O'] - - predicted_vector: model intent predictions for each individual query. Binary values within each list - indicate whether a class is prediced for the given query (1 for True, 0 for False) - Example: [[1,0,0,0,0,0], [0,0,1,0,0,0]] - """ - predicted_intents = [] - - if threshold is None: - threshold = self.threshold - logging.info(f'Using threshold = {threshold}') - - predicted_slots = [] - predicted_vector = [] - - mode = self.training - try: - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Retrieve intent and slot vocabularies from configuration. - intent_labels = self.cfg.data_desc.intent_labels - slot_labels = self.cfg.data_desc.slot_labels - - # Switch model to evaluation mode - self.eval() - self.to(device) - - # Dataset. - infer_datalayer = self._setup_infer_dataloader(queries, test_ds) - - for batch in infer_datalayer: - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch - - intent_logits, slot_logits = self.forward( - input_ids=input_ids.to(device), - token_type_ids=input_type_ids.to(device), - attention_mask=input_mask.to(device), - ) - - # predict intents and slots for these examples - # intents - intent_preds = tensor2list(torch.sigmoid(intent_logits)) - # convert numerical outputs to Intent and Slot labels from the dictionaries - for intents in intent_preds: - intent_lst = [] - temp_list = [] - for intent_num, probability in enumerate(intents): - if probability >= threshold: - intent_lst.append((intent_labels[int(intent_num)], round(probability, 2))) - temp_list.append(1) - else: - temp_list.append(0) - - predicted_vector.append(temp_list) - predicted_intents.append(intent_lst) - - # slots - slot_preds = torch.argmax(slot_logits, axis=-1) - temp_slots_preds = [] - - for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask): - temp_slots = "" - query_slots = "" - for slot, mask in zip(slot_preds_query, mask_query): - if mask == 1: - if slot < len(slot_labels): - query_slots += slot_labels[int(slot)] + " " - temp_slots += f"{slot} " - else: - query_slots += "Unknown_slot " - temp_slots += "0 " - predicted_slots.append(query_slots.strip()) - temp_slots_preds.append(temp_slots) - - finally: - # set mode back to its original value - self.train(mode=mode) - - return predicted_intents, predicted_slots, predicted_vector - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - To be added - """ - result = [] - return result diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 6cc317d1efea..e6eaee440acb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
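# Editor's note (hedged sketch): a condensed illustration of the threshold search done by
# optimize_threshold in the deleted multi-label model above -- sweep candidate thresholds,
# binarize the sigmoid probabilities, and keep the threshold with the best micro-averaged
# F1. The probabilities/labels below are random placeholders; in the model they come from
# prediction_probabilities() and the validation TSV file.
import numpy as np
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)
probs = rng.random((100, 6))                      # (num_queries, num_intents)
labels = rng.integers(0, 2, size=(100, 6))        # multi-hot ground truth

best_threshold, best_f1 = 0.5, -1.0
for t in np.arange(0.5, 0.96, 0.01):
    preds = (probs >= t).astype(int)
    f1 = f1_score(labels, preds, average="micro")
    if f1 > best_f1:
        best_threshold, best_f1 = float(t), f1

print(f"best threshold {best_threshold:.2f}, micro-F1 {best_f1:.3f}")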
+# pylint: skip-file +# flake8: noqa + import itertools import re from collections import OrderedDict @@ -24,7 +27,15 @@ from torch import Tensor from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer -from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores + +try: + from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores +except ModuleNotFoundError: + from abc import ABC + + AccuracyScore = ABC + BLEUScore = ABC + ROUGEScores = ABC from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common import ( PromptEncoder, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py index d3829c3e8de1..e459f69c1fcd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py @@ -11,13 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# pylint: skip-file +# flake8: noqa + from lightning.pytorch.trainer.trainer import Trainer from omegaconf.dictconfig import DictConfig -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import ( - TextToTextGLUEDataset, - TextToTextXNLIDataset, -) +try: + from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import ( + TextToTextGLUEDataset, + TextToTextXNLIDataset, + ) +except ModuleNotFoundError: + from abc import ABC + + TextToTextGLUEDataset = ABC + TextToTextXNLIDataset = ABC from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel from nemo.utils import logging diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 3b5c9f1161bb..1ea2481fae22 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -27,7 +27,15 @@ from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores + +try: + from nemo.collections.nlp.metrics.prompt_learning_metrics import AccuracyScore, BLEUScore, ROUGEScores +except ModuleNotFoundError: + from abc import ABC + + AccuracyScore = ABC + BLEUScore = ABC + ROUGEScores = ABC from nemo.collections.nlp.models.language_modeling.megatron_base_prompt_learning_model import ( MegatronBasePromptLearningModel, ) diff --git a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py index 3b8e1f819ea1..769ea3a0ddd7 100644 --- a/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/transformer_lm_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
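# Editor's note (hedged sketch): the hunks above repeat one guard pattern -- optional NLP
# metric/dataset imports are wrapped in try/except ModuleNotFoundError and replaced with
# abc.ABC placeholders, so the surrounding module can still be imported when the optional
# dependency is absent. A generic illustration of the same pattern; the package name
# some_optional_package is invented.
from abc import ABC

try:
    from some_optional_package.metrics import AccuracyScore, BLEUScore, ROUGEScores
except ModuleNotFoundError:
    # Placeholders keep annotations and module import working; code paths that actually
    # need the real metrics are expected to fail elsewhere if the dependency is missing.
    AccuracyScore = ABC
    BLEUScore = ABC
    ROUGEScores = ABC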
+# pylint: skip-file + import json import math from typing import Dict, Optional @@ -26,7 +28,13 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.nlp.data import SentenceDataset, TarredSentenceDataset -from nemo.collections.nlp.metrics import SequencePerplexity + +try: + from nemo.collections.nlp.metrics import SequencePerplexity +except ModuleNotFoundError: + from abc import ABC + + SequencePerplexity = ABC from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer diff --git a/nemo/collections/nlp/models/token_classification/__init__.py b/nemo/collections/nlp/models/token_classification/__init__.py deleted file mode 100644 index c903cc8812cb..000000000000 --- a/nemo/collections/nlp/models/token_classification/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationModelConfig, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_lexical_audio_model import ( - PunctuationCapitalizationLexicalAudioModel, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_model import ( - PunctuationCapitalizationModel, -) -from nemo.collections.nlp.models.token_classification.token_classification_model import TokenClassificationModel diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py deleted file mode 100644 index 86bf12b92315..000000000000 --- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_config.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
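# Editor's note (hedged sketch): the deleted config module that follows is built from
# nested dataclasses that OmegaConf can turn into typed ("structured") configs, with
# MISSING marking mandatory fields. A minimal illustration of that pattern using
# simplified stand-ins (the *Sketch classes and the placeholder model string are not the
# real NeMo definitions):
from dataclasses import dataclass
from typing import Optional

from omegaconf import MISSING, OmegaConf


@dataclass
class FreezeConfigSketch:
    is_enabled: bool = False
    d_model: Optional[int] = 256


@dataclass
class AudioEncoderConfigSketch:
    pretrained_model: str = MISSING          # mandatory: must come from YAML/CLI overrides
    freeze: Optional[FreezeConfigSketch] = None


cfg = OmegaConf.structured(AudioEncoderConfigSketch)
cfg = OmegaConf.merge(cfg, OmegaConf.create({"pretrained_model": "some_pretrained_audio_encoder"}))
print(OmegaConf.to_yaml(cfg))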
- -from dataclasses import dataclass, field -from typing import Any, Dict, Optional - -from omegaconf.omegaconf import MISSING, DictConfig, OmegaConf, open_dict - -from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset import ( - PunctuationCapitalizationEvalDataConfig, - PunctuationCapitalizationTrainDataConfig, - legacy_data_config_to_new_data_config, -) -from nemo.core.config import TrainerConfig -from nemo.core.config.modelPT import NemoConfig -from nemo.utils.exp_manager import ExpManagerConfig - - -@dataclass -class FreezeConfig: - is_enabled: bool = False - """Freeze audio encoder weight and add Conformer Layers on top of it""" - d_model: Optional[int] = 256 - """`d_model` parameter of ``ConformerLayer``""" - d_ff: Optional[int] = 1024 - """``d_ff`` parameter of ``ConformerLayer``""" - num_layers: Optional[int] = 8 - """``num_layers`` number of ``ConformerLayer`` modules to add on top of audio encoder""" - - -@dataclass -class AdapterConfig: - config: Optional[LinearAdapterConfig] = None - """Linear adapter config see ``collections.common.parts.LinearAdapterConfig``""" - enable: bool = False - """Use adapters for audio encoder""" - - -@dataclass -class FusionConfig: - num_layers: Optional[int] = 4 - """"Number of layers to use in fusion""" - num_attention_heads: Optional[int] = 4 - """Number of attention heads to use in fusion""" - inner_size: Optional[int] = 2048 - """Fusion inner size""" - - -@dataclass -class AudioEncoderConfig: - pretrained_model: str = MISSING - """A configuration for restoring pretrained audio encoder""" - freeze: Optional[FreezeConfig] = None - adapter: Optional[AdapterConfig] = None - fusion: Optional[FusionConfig] = None - - -@dataclass -class TokenizerConfig: - """A structure and default values of source text tokenizer.""" - - vocab_file: Optional[str] = None - """A path to vocabulary file which is used in ``'word'``, ``'char'``, and HuggingFace tokenizers""" - - tokenizer_name: str = MISSING - """A name of the tokenizer used for tokenization of source sequences. Possible options are ``'sentencepiece'``, - ``'word'``, ``'char'``, HuggingFace tokenizers (e.g. ``'bert-base-uncased'``). For more options see function - ``nemo.collections.nlp.modules.common.get_tokenizer``. The tokenizer must have properties ``cls_id``, ``pad_id``, - ``sep_id``, ``unk_id``.""" - - special_tokens: Optional[Dict[str, str]] = None - """A dictionary with special tokens passed to constructors of ``'char'``, ``'word'``, ``'sentencepiece'``, and - various HuggingFace tokenizers.""" - - tokenizer_model: Optional[str] = None - """A path to a tokenizer model required for ``'sentencepiece'`` tokenizer.""" - - -@dataclass -class LanguageModelConfig: - """ - A structure and default values of language model configuration of punctuation and capitalization model. BERT like - HuggingFace models are supported. Provide a valid ``pretrained_model_name`` and, optionally, you may - reinitialize model via ``config_file`` or ``config``. - - Alternatively you can initialize the language model using ``lm_checkpoint``. - - This config is a part of :class:`PunctuationCapitalizationModelConfig` config. - """ - - pretrained_model_name: str = MISSING - """A mandatory parameter containing name of HuggingFace pretrained model. 
For example, ``'bert-base-uncased'``.""" - - config_file: Optional[str] = None - """A path to a file with HuggingFace model config which is used to reinitialize language model.""" - - config: Optional[Dict] = None - """A HuggingFace config which is used to reinitialize language model.""" - - lm_checkpoint: Optional[str] = None - """A path to a ``torch`` checkpoint of a language model.""" - - -@dataclass -class HeadConfig: - """ - A structure and default values of configuration of capitalization or punctuation model head. This config defines a - multilayer perceptron which is applied to output of a language model. Number of units in the hidden layer is equal - to the dimension of the language model. - - This config is a part of :class:`PunctuationCapitalizationModelConfig` config. - """ - - num_fc_layers: int = 1 - """A number of hidden layers in a multilayer perceptron.""" - - fc_dropout: float = 0.1 - """A dropout used in an MLP.""" - - activation: str = 'relu' - """An activation used in hidden layers.""" - - use_transformer_init: bool = True - """Whether to initialize the weights of the classifier head with the approach that was used for language model - initialization.""" - - -@dataclass -class ClassLabelsConfig: - """ - A structure and default values of a mandatory part of config which contains names of files which are saved in .nemo - checkpoint. These files can also be used for passing label vocabulary to the model. For using them as label - vocabularies you will need to provide path these files in parameter - ``model.common_dataset_parameters.label_vocab_dir``. Each line in labels files - contains 1 label. The values are sorted, ``==