diff --git a/experiments/QNLI/roberta/params.yaml b/experiments/QNLI/roberta/params.yaml
new file mode 100644
index 0000000..b439dad
--- /dev/null
+++ b/experiments/QNLI/roberta/params.yaml
@@ -0,0 +1,16 @@
+dev_file: data/preprocessed/QNLI/dev_roberta.jsonl
+meta_dir: data/preprocessed/QNLI/
+train_file: data/preprocessed/QNLI/train_roberta.jsonl
+#dev_file: data/preprocessed/QQPdebug/dev_roberta.jsonl
+#meta_dir: data/preprocessed/QQPdebug/
+#train_file: data/preprocessed/QQPdebug/train_roberta.jsonl
+network: roberta
+fix_embeddings: false
+use_cuda: true
+batch_size: 32
+epoches: 10
+optimizer: bert-adam
+length_limit: 128
+learning_rate: 1.0e-5
+warmup_proportion: 0.1
+model_dir: ../roberta-base-py/
diff --git a/lion/common/tokenizer.py b/lion/common/tokenizer.py
index 66788d0..2bfe50a 100644
--- a/lion/common/tokenizer.py
+++ b/lion/common/tokenizer.py
@@ -3,10 +3,14 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 import os
+import re
 import six
+import sys
 import copy
+import json
 import spacy
 import logging
+import regex as re
 import unicodedata
 import collections
 from shutil import copyfile
@@ -532,10 +536,7 @@ class XLNetTokenizer(Tokenizer):
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
-    max_model_input_sizes = {}
-    vocab_files_names = {}
-
-    def __init__(self, vocab_file, max_len=None, do_lower_case=False, remove_space=True, keep_accents=False, **kwargs):
+    def __init__(self, vocab_file, max_len=None, do_lower_case=False, remove_space=True, keep_accents=False):
         super(XLNetTokenizer, self).__init__()
         self.max_len = max_len if max_len is not None else int(1e12)
 
@@ -683,18 +684,10 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
                 skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
         """
         if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return self.added_tokens_decoder[ids]
-            else:
-                return self._convert_id_to_token(ids)
+            return self._convert_id_to_token(ids)
         tokens = []
         for index in ids:
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
+            tokens.append(self._convert_id_to_token(index))
         return tokens
 
     def _convert_id_to_token(self, index, return_unicode=True):
@@ -724,7 +717,223 @@ def save_vocabulary(self, save_directory):
         return (out_vocab_file,)
 
 
+class RobertaTokenizer(Tokenizer):
+    """
+    GPT-2 BPE tokenizer. Peculiarities:
+        - Byte-level Byte-Pair-Encoding
+        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
+          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"
+    """
+    def __init__(self, vocab_file, merges_file, max_len=None):
+        super(RobertaTokenizer, self).__init__()
+        self.unk_token = "<unk>"
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.byte_encoder = self.bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+
+        self.cache = {}
+
+        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = self.get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = self.get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def tokenize(self, text, add_prefix_space=False):
+        split_tokens = []
+        bpe_tokens = self._tokenize(text, add_prefix_space)
+        bpe_token_ids = self.convert_tokens_to_ids(bpe_tokens)
+        for bpe_token in bpe_token_ids:
+            split_tokens.append((
+                bpe_token,
+                None,
+                None,
+                None,
+                None,
+                None,
+            ))
+        # return split_tokens
+        # Set special option for non-entity tag: '' vs 'O' in spaCy
+        return Tokens(split_tokens, opts={'non_ent': ''})
+
+    def _tokenize(self, text, add_prefix_space=True):
+        """ Tokenize a string.
+            return_unicode is used only for py2
+        """
+        if add_prefix_space:
+            text = ' ' + text
+
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            if sys.version_info[0] == 2:
+                # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            else:
+                # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+
+        return bpe_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+            (resp. a sequence of ids), using the vocabulary.
+        """
+        if tokens is None:
+            return None
+
+        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
+            return self._convert_token_to_id(tokens)
+
+        ids = []
+        for token in tokens:
+            ids.append(self._convert_token_to_id(token))
+        if len(ids) > self.max_len:
+            logger.warning(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this XLNET model ({} > {}). Running this"
+                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """ Converts a single index or a sequence of indices (integers) in a token "
+            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            Args:
+                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+        """
+        if isinstance(ids, int):
+            return self._convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        text = ''.join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace')
+        return text
+
+    def save_vocabulary(self, save_directory):
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        vocab_file = os.path.join(save_directory, 'vocab.json')
+        merge_file = os.path.join(save_directory, 'merges.txt')
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write(u'#version: 0.2\n')
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(' '.join(bpe_tokens) + u'\n')
+                index += 1
+
+        return vocab_file, merge_file
+
+    def bytes_to_unicode(self):
+        """
+        Returns list of utf-8 byte and a mapping to unicode strings.
+        We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
+
+        The reversible bpe codes work on unicode strings.
+        This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+        When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+        This is a signficant percentage of your normal, say, 32K bpe vocab.
+        To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+        """
+        _chr = unichr if sys.version_info[0] == 2 else chr
+        bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(
+            range(ord("®"), ord("ÿ") + 1))
+        cs = bs[:]
+        n = 0
+        for b in range(2 ** 8):
+            if b not in bs:
+                bs.append(b)
+                cs.append(2 ** 8 + n)
+                n += 1
+        cs = [_chr(n) for n in cs]
+        return dict(zip(bs, cs))
+
+    def get_pairs(self, word):
+        """Return set of symbol pairs in a word.
+        Word is represented as tuple of symbols (symbols being variable-length strings).
+        """
+        pairs = set()
+        prev_char = word[0]
+        for char in word[1:]:
+            pairs.add((prev_char, char))
+            prev_char = char
+        return pairs
+
+
 def get_class(name):
+    name = str(name).lower()
     if name == 'spacy':
         return SpacyTokenizer
     elif name == 'bert':
@@ -733,5 +942,7 @@ def get_class(name):
         return JiebaTokenizer
     elif name == 'xlnet':
         return XLNetTokenizer
+    elif name == 'roberta':
+        return RobertaTokenizer
     else:
         raise ValueError("Unspport tokenize algorithm:{}".format(name))
diff --git a/lion/data/dataset.py b/lion/data/dataset.py
index 668b6a5..d2ae1e1 100644
--- a/lion/data/dataset.py
+++ b/lion/data/dataset.py
@@ -63,7 +63,7 @@ def make_char(char_dict, token, word_length=16):
             # Not index words
             oriAtoken = ex['Atokens']
             oriBtoken = ex['Btokens']
-        if self.args.network == 'xlnet':
+        if self.args.network == 'xlnet' or self.args.network == 'roberta':
             Atoken = torch.LongTensor(ex['Atokens'])
             Btoken = torch.LongTensor(ex['Btokens'])
             Achar = torch.zeros(len(ex['Atokens']), 16)
diff --git a/lion/data/processor.py b/lion/data/processor.py
index e83b4a4..4c902a5 100644
--- a/lion/data/processor.py
+++ b/lion/data/processor.py
@@ -44,17 +44,31 @@ def process_datum(datum, tokenizer, label2index, max_length):
     rv['Bpos'] = B.pos()
     rv['Bner'] = B.entities()
     if tokenizer.__class__.__name__ == 'BertTokenizer':
-        # Adapt to bert input format
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        A Bert sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        """
         truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length-3)
         rv['Atokens'] = ["[CLS]"] + rv['Atokens'] + ["[SEP]"]
         rv['Btokens'] = rv['Btokens'] + ["[SEP]"]
     if tokenizer.__class__.__name__ == 'XLNetTokenizer':
-        # Adapt to xlnet input format
-        # special_symbols = {SEG_ID_A: 0, SEG_ID_B: 1, SEG_ID_CLS: 2, "<cls>": 3, "<sep>": 4,
-        # SEG_ID_SEP: 3, SEG_ID_PAD: 4}
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        A xlnet sequence pair has the following format: A [SEP] B [SEP] [CLS]
+        special_symbols = {SEG_ID_A: 0, SEG_ID_B: 1, SEG_ID_CLS: 2, "<cls>": 3, "<sep>": 4,
+        SEG_ID_SEP: 3, SEG_ID_PAD: 4}
+        """
         truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length-3)
         rv['Atokens'] = rv['Atokens'] + [4]
         rv['Btokens'] = rv['Btokens'] + [4] + [3]
+    if tokenizer.__class__.__name__ == 'RobertaTokenizer':
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
+        """
+        truncate_seq_pair(rv['Atokens'], rv['Btokens'], max_length - 3)
+        rv['Atokens'] = [0] + rv['Atokens'] + [2] + [2]
+        rv['Btokens'] = rv['Btokens'] + [2]
     return rv
 
 
@@ -75,15 +89,20 @@ def truncate_seq_pair(tokens_a, tokens_b, max_length):
 
 
 def process_dataset(in_dir, out_dir, splits=['train', 'dev', 'test'],
-                    tokenizer_name='spacy', vocab_file=None, max_length=128):
+                    tokenizer_name='spacy', vocab_file=None, max_length=128, **kwargs):
     def jsondump(data, filename):
         json.dump(data, open(osp.join(out_dir, filename), 'w'), indent=2, ensure_ascii=False)
-    if tokenizer_name == 'bert':
-        tokenizer = get_class(tokenizer_name)(vocab_file)
-    elif tokenizer_name == 'xlnet':
+
+    if tokenizer_name == 'bert' or tokenizer_name == 'xlnet':
+        if not vocab_file:
+            raise ValueError('Bert model should use an existing vocab')
         tokenizer = get_class(tokenizer_name)(vocab_file)
+    elif tokenizer_name == 'roberta':
+        merges_file = kwargs.pop('merges_file')
+        tokenizer = get_class(tokenizer_name)(vocab_file, merges_file)
     else:
         tokenizer = get_class(tokenizer_name)()
+
     if not osp.exists(out_dir):
         os.makedirs(out_dir, exist_ok=True)
 
@@ -96,15 +115,10 @@ def jsondump(data, filename):
         processed = []
         for datum in tqdm(dataset):
             try:
-                processed.append(process_datum(datum, tokenizer, label2index))
-            except Exception as e:
-                print(e)
                 processed.append(process_datum(datum, tokenizer, label2index, max_length))
-            except:
-                raise ValueError('Bae line {}'.format(datum))
-        #with Pool(30) as p:
-        #    processed = p.map(tokenizer.tokenize, dataset)
-        if tokenizer_name != 'xlnet':
+            except Exception as e:
+                raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False))
+        if tokenizer_name != 'xlnet' and tokenizer_name != 'roberta':
             char_dict, word_dict, pos_dict, ner_dict = gather_dict(processed)
             jsondump(char_dict, 'char.json')
             jsondump(word_dict, 'word.json')
@@ -113,6 +127,10 @@ def jsondump(data, filename):
         out_file = open(osp.join(out_dir, 'train_{}.jsonl'.format(tokenizer_name)), 'w')
         for datum in processed:
             out_file.write('{}\n'.format(json.dumps(datum, ensure_ascii=False)))
+
+    if 'train' not in splits:
+        raise ValueError('`splits` argument must contain `train` otherwise `label2index` will be NontType!')
+
     if 'dev' in splits:
         split = 'dev.jsonl'
         filename = osp.join(in_dir, split)
@@ -122,8 +140,8 @@ def jsondump(data, filename):
         for datum in tqdm(dataset):
             try:
                 processed.append(process_datum(datum, tokenizer, label2index, max_length))
-            except:
-                raise ValueError('Bae line {}'.format(datum, ensure_ascii=False))
+            except Exception as e:
+                raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False))
         for datum in processed:
             out_file.write('{}\n'.format(json.dumps(datum)))
     if 'test' in splits:
@@ -135,8 +153,8 @@ def jsondump(data, filename):
         for datum in tqdm(dataset):
             try:
                 processed.append(process_datum(datum, tokenizer, label2index, max_length))
-            except:
-                raise ValueError('Bae line {}'.format(datum))
+            except Exception as e:
+                raise ValueError('Exception: {}, Bae line {}'.format(e, datum, ensure_ascii=False))
         for datum in processed:
             out_file.write('{}\n'.format(json.dumps(datum, ensure_ascii=False)))
 
diff --git a/lion/models/__init__.py b/lion/models/__init__.py
index 637c691..448cca9 100644
--- a/lion/models/__init__.py
+++ b/lion/models/__init__.py
@@ -7,6 +7,7 @@
 from .esim import ESIM
 from .bert import BertForSequenceClassification
 from .xlnet import XLNetForSequenceClassification
+from .roberta import RobertaForSequenceClassification
 
 
 def get_model_class(name):
@@ -20,4 +21,6 @@ def get_model_class(name):
         return BertForSequenceClassification
     if name == 'xlnet':
         return XLNetForSequenceClassification
+    if name == 'roberta':
+        return RobertaForSequenceClassification
     raise RuntimeError('Invalid model %s' % name)
diff --git a/lion/models/bert.py b/lion/models/bert.py
index 7a87d2f..37c4bfd 100644
--- a/lion/models/bert.py
+++ b/lion/models/bert.py
@@ -112,6 +112,8 @@ class BertPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+    base_model_prefix = "bert"
+
     def __init__(self, config, *inputs, **kwargs):
         super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
@@ -165,7 +167,7 @@ def from_pretrained(cls, pretrained_model_path, *inputs, **kwargs):
                 (ex: num_labels for BertForSequenceClassification)
         """
         # Load config
-        config_file = os.path.join(pretrained_model_path, 'bert_config.json')
+        config_file = os.path.join(pretrained_model_path, '{}_config.json'.format(cls.base_model_prefix))
         if not os.path.exists(config_file):
             # Backward compatibility with old naming format
             config_file = os.path.join(pretrained_model_path, 'config.json')
@@ -211,8 +213,8 @@ def load(module, prefix=''):
                 if child is not None:
                     load(child, prefix + name + '.')
         start_prefix = ''
-        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
-            start_prefix = 'bert.'
+        if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            start_prefix = cls.base_model_prefix + '.'
         load(model, prefix=start_prefix)
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
@@ -239,10 +241,11 @@ def __init__(self, config):
         self.LayerNorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(self, input_ids, token_type_ids=None):
+    def forward(self, input_ids, token_type_ids=None, position_ids=None):
         seq_length = input_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
-        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
diff --git a/lion/models/roberta.py b/lion/models/roberta.py
new file mode 100644
index 0000000..1ca9016
--- /dev/null
+++ b/lion/models/roberta.py
@@ -0,0 +1,86 @@
+import torch
+import torch.nn as nn
+
+from lion.models.bert import BertEmbeddings, BertModel, BertPreTrainedModel
+
+
+class RobertaEmbeddings(BertEmbeddings):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+    def __init__(self, config):
+        super(RobertaEmbeddings, self).__init__(config)
+        self.padding_idx = 1  # handed to both embedding tables below as their padding_idx
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
+                                                padding_idx=self.padding_idx)
+
+    def forward(self, input_ids=None, position_ids=None):  # NOTE(review): 2nd positional is position_ids, but token_type_ids in BertEmbeddings.forward — confirm callers
+        seq_length = input_ids.size(1)  # input_ids is required despite its None default
+        if position_ids is None:
+            # Default positions run from padding_idx+1 to padding_idx+seq_length, identically for every row;
+            # the row content (e.g. padding) is not consulted. cf. fairseq's `utils.make_positions`
+            position_ids = torch.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=torch.long,
+                                        device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        return super(RobertaEmbeddings, self).forward(input_ids, position_ids=position_ids)
+
+
+class RobertaModel(BertModel):
+    base_model_prefix = "roberta"  # from_pretrained (bert.py) builds the config file name and weight prefix from it
+
+    def __init__(self, config):
+        super(RobertaModel, self).__init__(config)
+
+        self.embeddings = RobertaEmbeddings(config)  # replaces the embeddings attribute set up by the parent
+        self.apply(self.init_bert_weights)  # presumably re-initialises the new embeddings too — defined elsewhere, TODO confirm
+
+    def forward(self, input_ids, attention_mask=None, output_all_encoded_layers=False):
+        if input_ids[:, 0].sum().item() != 0:  # for non-negative ids: true unless every row starts with id 0
+            print("A sequence with no special tokens has been passed to the RoBERTa model. "
+                  "This model requires special tokens in order to work. "
+                  "Please specify add_special_tokens=True in your encoding.")
+        return super(RobertaModel, self).forward(input_ids, attention_mask=attention_mask,
+                                                 output_all_encoded_layers=output_all_encoded_layers)
+
+
+class RobertaForSequenceClassification(BertPreTrainedModel):
+    base_model_prefix = "roberta"  # from_pretrained builds the config file name and weight prefix from it
+
+    def __init__(self, config, num_labels):
+        super(RobertaForSequenceClassification, self).__init__(config)
+        self.num_labels = num_labels
+
+        self.roberta = RobertaModel(config)
+        self.classifier = RobertaClassificationHead(config, self.num_labels)
+
+    def forward(self, ex):  # ex: mapping that provides 'Atoken_ids', 'Btoken_ids', 'Amask' and 'Bmask'
+        A = ex['Atoken_ids']
+        B = ex['Btoken_ids']
+        Amask = ex['Amask']
+        Bmask = ex['Bmask']
+        input_ids = torch.cat([A, B], dim=-1)  # B is appended after A along the last dimension
+        attention_mask = torch.cat([Amask, Bmask], dim=-1)  # the masks are joined the same way
+
+        sequence_output, _ = self.roberta(input_ids, attention_mask=attention_mask, output_all_encoded_layers=False)  # 2nd value unused
+        logits = self.classifier(sequence_output)  # the head reads position 0 of the sequence output
+        return logits
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config, num_labels):
+        super(RobertaClassificationHead, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
\ No newline at end of file
diff --git a/lion/training/model.py b/lion/training/model.py
index 31fece4..f359bc6 100644
--- a/lion/training/model.py
+++ b/lion/training/model.py
@@ -18,7 +18,7 @@
 class MatchingModel:
     def __init__(self, params, state_dict=None):
         self.params = params
-        if params.network == 'bert' or params.network == 'xlnet':
+        if params.network == 'bert' or params.network == 'xlnet' or params.network == 'roberta':
             self.network = get_model_class(params.network).from_pretrained(params.model_dir, params.classes)
         else:
             self.network = get_model_class(params.network)(params)