diff --git a/experiments/QNLI/roberta/params.yaml b/experiments/QNLI/roberta/params.yaml new file mode 100644 index 0000000..b439dad --- /dev/null +++ b/experiments/QNLI/roberta/params.yaml @@ -0,0 +1,16 @@ +dev_file: data/preprocessed/QNLI/dev_roberta.jsonl +meta_dir: data/preprocessed/QNLI/ +train_file: data/preprocessed/QNLI/train_roberta.jsonl +#dev_file: data/preprocessed/QQPdebug/dev_roberta.jsonl +#meta_dir: data/preprocessed/QQPdebug/ +#train_file: data/preprocessed/QQPdebug/train_roberta.jsonl +network: roberta +fix_embeddings: false +use_cuda: true +batch_size: 32 +epoches: 10 +optimizer: bert-adam +length_limit: 128 +learning_rate: 1.0e-5 +warmup_proportion: 0.1 +model_dir: ../roberta-base-py/ diff --git a/lion/common/tokenizer.py b/lion/common/tokenizer.py index 66788d0..2bfe50a 100644 --- a/lion/common/tokenizer.py +++ b/lion/common/tokenizer.py @@ -3,10 +3,14 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) import os +import re import six +import sys import copy +import json import spacy import logging +import regex as re import unicodedata import collections from shutil import copyfile @@ -532,10 +536,7 @@ class XLNetTokenizer(Tokenizer): - requires `SentencePiece `_ """ - max_model_input_sizes = {} - vocab_files_names = {} - - def __init__(self, vocab_file, max_len=None, do_lower_case=False, remove_space=True, keep_accents=False, **kwargs): + def __init__(self, vocab_file, max_len=None, do_lower_case=False, remove_space=True, keep_accents=False): super(XLNetTokenizer, self).__init__() self.max_len = max_len if max_len is not None else int(1e12) @@ -683,18 +684,10 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False """ if isinstance(ids, int): - if ids in self.added_tokens_decoder: - return self.added_tokens_decoder[ids] - else: - return self._convert_id_to_token(ids) + return self._convert_id_to_token(ids) tokens = [] for index in ids: - if skip_special_tokens and index in self.all_special_ids: - continue - if index in self.added_tokens_decoder: - tokens.append(self.added_tokens_decoder[index]) - else: - tokens.append(self._convert_id_to_token(index)) + tokens.append(self._convert_id_to_token(index)) return tokens def _convert_id_to_token(self, index, return_unicode=True): @@ -724,7 +717,223 @@ def save_vocabulary(self, save_directory): return (out_vocab_file,) +class RobertaTokenizer(Tokenizer): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => the encoding and tokenize methods should be called with the + ``add_prefix_space`` flag set to ``True``. + Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve + the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello" + """ + def __init__(self, vocab_file, merges_file, max_len=None): + super(RobertaTokenizer, self).__init__() + self.unk_token = "" + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v: k for k, v in self.encoder.items()} + self.byte_encoder = self.bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + return len(self.encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = self.get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = self.get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text, add_prefix_space=False): + split_tokens = [] + bpe_tokens = self._tokenize(text, add_prefix_space) + bpe_token_ids = self.convert_tokens_to_ids(bpe_tokens) + for bpe_token in bpe_token_ids: + split_tokens.append(( + bpe_token, + None, + None, + None, + None, + None, + )) + # return split_tokens + # Set special option for non-entity tag: '' vs 'O' in spaCy + return Tokens(split_tokens, opts={'non_ent': ''}) + + def _tokenize(self, text, add_prefix_space=True): + """ Tokenize a string. + return_unicode is used only for py2 + """ + if add_prefix_space: + text = ' ' + text + + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id + (resp. a sequence of ids), using the vocabulary. + """ + if tokens is None: + return None + + if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): + return self._convert_token_to_id(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id(token)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this XLNET model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False + """ + if isinstance(ids, int): + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index, return_unicode=True): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + text = ''.join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace') + return text + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, 'vocab.json') + merge_file = os.path.join(save_directory, 'merges.txt') + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file + + def bytes_to_unicode(self): + """ + Returns list of utf-8 byte and a mapping to unicode strings. + We specifically avoids mapping to whitespace/control characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list( + range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + def get_pairs(self, word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + def get_class(name): + name = str(name).lower() if name == 'spacy': return SpacyTokenizer elif name == 'bert': @@ -733,5 +942,7 @@ def get_class(name): return JiebaTokenizer elif name == 'xlnet': return XLNetTokenizer + elif name == 'roberta': + return RobertaTokenizer else: raise ValueError("Unspport tokenize algorithm:{}".format(name)) diff --git a/lion/data/dataset.py b/lion/data/dataset.py index 668b6a5..d2ae1e1 100644 --- a/lion/data/dataset.py +++ b/lion/data/dataset.py @@ -63,7 +63,7 @@ def make_char(char_dict, token, word_length=16): # Not index words oriAtoken = ex['Atokens'] oriBtoken = ex['Btokens'] - if self.args.network == 'xlnet': + if self.args.network == 'xlnet' or self.args.network == 'roberta': Atoken = torch.LongTensor(ex['Atokens']) Btoken = torch.LongTensor(ex['Btokens']) Achar = torch.zeros(len(ex['Atokens']), 16) diff --git a/lion/data/processor.py b/lion/data/processor.py index e83b4a4..4c902a5 100644 --- a/lion/data/processor.py +++ b/lion/data/processor.py @@ -44,17 +44,31 @@ def process_datum(datum, tokenizer, label2index, max_length): rv['Bpos'] = B.pos() rv['Bner'] = B.entities() if tokenizer.__class__.__name__ == 'BertTokenizer': - # Adapt to bert input format + """ + Adds special tokens to a sequence pair for sequence classification tasks. + A Bert sequence pair has the following format: [CLS] A [SEP] B [SEP] + """ truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length-3) rv['Atokens'] = ["[CLS]"] + rv['Atokens'] + ["[SEP]"] rv['Btokens'] = rv['Btokens'] + ["[SEP]"] if tokenizer.__class__.__name__ == 'XLNetTokenizer': - # Adapt to xlnet input format - # special_symbols = {SEG_ID_A: 0, SEG_ID_B: 1, SEG_ID_CLS: 2, "": 3, "": 4, - # SEG_ID_SEP: 3, SEG_ID_PAD: 4} + """ + Adds special tokens to a sequence pair for sequence classification tasks. + A xlnet sequence pair has the following format: A [SEP] B [SEP] [CLS] + special_symbols = {SEG_ID_A: 0, SEG_ID_B: 1, SEG_ID_CLS: 2, "": 3, "": 4, + SEG_ID_SEP: 3, SEG_ID_PAD: 4} + """ truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length-3) rv['Atokens'] = rv['Atokens'] + [4] rv['Btokens'] = rv['Btokens'] + [4] + [3] + if tokenizer.__class__.__name__ == 'RobertaTokenizer': + """ + Adds special tokens to a sequence pair for sequence classification tasks. + A RoBERTa sequence pair has the following format: A B + """ + truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length - 3) + rv['Atokens'] = [0] + rv['Atokens'] + [2] + [2] + rv['Btokens'] = rv['Btokens'] + [2] return rv @@ -75,15 +89,20 @@ def truncate_seq_pair(tokens_a, tokens_b, max_length): def process_dataset(in_dir, out_dir, splits=['train', 'dev', 'test'], - tokenizer_name='spacy', vocab_file=None, max_length=128): + tokenizer_name='spacy', vocab_file=None, max_length=128, **kwargs): def jsondump(data, filename): json.dump(data, open(osp.join(out_dir, filename), 'w'), indent=2, ensure_ascii=False) - if tokenizer_name == 'bert': - tokenizer = get_class(tokenizer_name)(vocab_file) - elif tokenizer_name == 'xlnet': + + if tokenizer_name == 'bert' or tokenizer_name == 'xlnet': + if not vocab_file: + raise ValueError('Bert model should use an existing vocab') tokenizer = get_class(tokenizer_name)(vocab_file) + elif tokenizer_name == 'roberta': + merges_file = kwargs.pop('merges_file') + tokenizer = get_class(tokenizer_name)(vocab_file, merges_file) else: tokenizer = get_class(tokenizer_name)() + if not osp.exists(out_dir): os.makedirs(out_dir, exist_ok=True) @@ -96,15 +115,10 @@ def jsondump(data, filename): processed = [] for datum in tqdm(dataset): try: - processed.append(process_datum(datum, tokenizer, label2index)) - except Exception as e: - print(e) processed.append(process_datum(datum, tokenizer, label2index, max_length)) - except: - raise ValueError('Bae line {}'.format(datum)) - #with Pool(30) as p: - # processed = p.map(tokenizer.tokenize, dataset) - if tokenizer_name != 'xlnet': + except Exception as e: + raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False)) + if tokenizer_name != 'xlnet' and tokenizer_name != 'roberta': char_dict, word_dict, pos_dict, ner_dict = gather_dict(processed) jsondump(char_dict, 'char.json') jsondump(word_dict, 'word.json') @@ -113,6 +127,10 @@ def jsondump(data, filename): out_file = open(osp.join(out_dir, 'train_{}.jsonl'.format(tokenizer_name)), 'w') for datum in processed: out_file.write('{}\n'.format(json.dumps(datum, ensure_ascii=False))) + + if 'train' not in splits: + raise ValueError('`splits` argument must contain `train` otherwise `label2index` will be NontType!') + if 'dev' in splits: split = 'dev.jsonl' filename = osp.join(in_dir, split) @@ -122,8 +140,8 @@ def jsondump(data, filename): for datum in tqdm(dataset): try: processed.append(process_datum(datum, tokenizer, label2index, max_length)) - except: - raise ValueError('Bae line {}'.format(datum, ensure_ascii=False)) + except Exception as e: + raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False)) for datum in processed: out_file.write('{}\n'.format(json.dumps(datum))) if 'test' in splits: @@ -135,8 +153,8 @@ def jsondump(data, filename): for datum in tqdm(dataset): try: processed.append(process_datum(datum, tokenizer, label2index, max_length)) - except: - raise ValueError('Bae line {}'.format(datum)) + except Exception as e: + raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False)) for datum in processed: out_file.write('{}\n'.format(json.dumps(datum, ensure_ascii=False))) diff --git a/lion/models/__init__.py b/lion/models/__init__.py index 637c691..448cca9 100644 --- a/lion/models/__init__.py +++ b/lion/models/__init__.py @@ -7,6 +7,7 @@ from .esim import ESIM from .bert import BertForSequenceClassification from .xlnet import XLNetForSequenceClassification +from .roberta import RobertaForSequenceClassification def get_model_class(name): @@ -20,4 +21,6 @@ def get_model_class(name): return BertForSequenceClassification if name == 'xlnet': return XLNetForSequenceClassification + if name == 'roberta': + return RobertaForSequenceClassification raise RuntimeError('Invalid model %s' % name) diff --git a/lion/models/bert.py b/lion/models/bert.py index 7a87d2f..37c4bfd 100644 --- a/lion/models/bert.py +++ b/lion/models/bert.py @@ -112,6 +112,8 @@ class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + base_model_prefix = "bert" + def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): @@ -165,7 +167,7 @@ def from_pretrained(cls, pretrained_model_path, *inputs, **kwargs): (ex: num_labels for BertForSequenceClassification) """ # Load config - config_file = os.path.join(pretrained_model_path, 'bert_config.json') + config_file = os.path.join(pretrained_model_path, '{}_config.json'.format(cls.base_model_prefix)) if not os.path.exists(config_file): # Backward compatibility with old naming format config_file = os.path.join(pretrained_model_path, 'config.json') @@ -211,8 +213,8 @@ def load(module, prefix=''): if child is not None: load(child, prefix + name + '.') start_prefix = '' - if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): - start_prefix = 'bert.' + if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + start_prefix = cls.base_model_prefix + '.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( @@ -239,10 +241,11 @@ def __init__(self, config): self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, input_ids, token_type_ids=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None): seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) diff --git a/lion/models/roberta.py b/lion/models/roberta.py new file mode 100644 index 0000000..1ca9016 --- /dev/null +++ b/lion/models/roberta.py @@ -0,0 +1,86 @@ +import torch +import torch.nn as nn + +from lion.models.bert import BertEmbeddings, BertModel, BertPreTrainedModel + + +class RobertaEmbeddings(BertEmbeddings): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + def __init__(self, config): + super(RobertaEmbeddings, self).__init__(config) + self.padding_idx = 1 + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, + padding_idx=self.padding_idx) + + def forward(self, input_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + # Position numbers begin at padding_idx+1. Padding symbols are ignored. + # cf. fairseq's `utils.make_positions` + position_ids = torch.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=torch.long, + device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + return super(RobertaEmbeddings, self).forward(input_ids, position_ids=position_ids) + + +class RobertaModel(BertModel): + base_model_prefix = "roberta" + + def __init__(self, config): + super(RobertaModel, self).__init__(config) + + self.embeddings = RobertaEmbeddings(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, attention_mask=None, output_all_encoded_layers=False): + if input_ids[:, 0].sum().item() != 0: + print("A sequence with no special tokens has been passed to the RoBERTa model. " + "This model requires special tokens in order to work. " + "Please specify add_special_tokens=True in your encoding.") + return super(RobertaModel, self).forward(input_ids, attention_mask=attention_mask, + output_all_encoded_layers=output_all_encoded_layers) + + +class RobertaForSequenceClassification(BertPreTrainedModel): + base_model_prefix = "roberta" + + def __init__(self, config, num_labels): + super(RobertaForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + + self.roberta = RobertaModel(config) + self.classifier = RobertaClassificationHead(config, self.num_labels) + + def forward(self, ex): + A = ex['Atoken_ids'] + B = ex['Btoken_ids'] + Amask = ex['Amask'] + Bmask = ex['Bmask'] + input_ids = torch.cat([A, B], dim=-1) + attention_mask = torch.cat([Amask, Bmask], dim=-1) + + sequence_output, _ = self.roberta(input_ids, attention_mask=attention_mask, output_all_encoded_layers=False) + logits = self.classifier(sequence_output) + return logits + + +class RobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, num_labels): + super(RobertaClassificationHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x \ No newline at end of file diff --git a/lion/training/model.py b/lion/training/model.py index 31fece4..f359bc6 100644 --- a/lion/training/model.py +++ b/lion/training/model.py @@ -18,7 +18,7 @@ class MatchingModel: def __init__(self, params, state_dict=None): self.params = params - if params.network == 'bert' or params.network == 'xlnet': + if params.network == 'bert' or params.network == 'xlnet' or params.network == 'roberta': self.network = get_model_class(params.network).from_pretrained(params.model_dir, params.classes) else: self.network = get_model_class(params.network)(params)