diff --git a/README.md b/README.md index 530687c..1028832 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Below is a example for English Question-Answering. We also provide an example fo ```python from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT -from bertserini.utils.utils_new import get_best_answer +from bertserini.utils.utils import get_best_answer model_name = "rsvp-ai/bertserini-bert-base-squad" tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" diff --git a/bertserini/experiments/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py index 8c32312..300cf9f 100755 --- a/bertserini/experiments/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -4,6 +4,12 @@ import argparse import json +from rouge_metric import PyRouge +rouge = PyRouge(rouge_n=(2,), rouge_su=True, skip_gap=4) +#from rouge_score import rouge_scorer +#rouge1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) +#rougel_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + from bertserini.utils.utils import normalize_answer, init_logger logger = init_logger("evluation") @@ -67,6 +73,32 @@ def overlap_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): return normalize_answer(prediction) == normalize_answer(ground_truth) +def rouge2_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["r"] + #return rouge1_scorer.score(prediction, ground_truth) + +def rouge2_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["f"] + +def rougesu4_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["r"] + +def rougesu4_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return 
rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["f"] + +#def rougel_score(prediction, ground_truth): +# print(rougel_scorer.score(prediction, ground_truth)) +# input() +# return rougel_scorer.score(prediction, ground_truth) + def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): scores_for_ground_truths = [] @@ -92,7 +124,7 @@ def metric_max_recall(metric_fn, prediction, ground_truths): def evaluate(dataset, predictions): - sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = 0 + sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = rouge2_r = rouge2_f = rougesu4_r = rougesu4_f = 0 for article in dataset: for paragraph in article['paragraphs']: for qa in paragraph['qas']: @@ -104,6 +136,11 @@ def evaluate(dataset, predictions): ground_truths = list(map(lambda x: x['text'], qa['answers'])) prediction = [predictions[qa['id']]] #prediction_sentence = predictions[qa['id']]['sentences'] + rouge2_r += metric_max_recall(rouge2_r_score, prediction, ground_truths) + rouge2_f += metric_max_recall(rouge2_f_score, prediction, ground_truths) + rougesu4_r += metric_max_recall(rougesu4_r_score, prediction, ground_truths) + rougesu4_f += metric_max_recall(rougesu4_f_score, prediction, ground_truths) + #rougel += metric_max_recall(rougel_score, prediction, ground_truths) cover += metric_max_recall(cover_score, prediction, ground_truths) exact_match += metric_max_recall( exact_match_score, prediction, ground_truths) @@ -124,21 +161,27 @@ def evaluate(dataset, predictions): overlap = 100.0 * overlap / total cover = 100.0 * cover / total precision = 100.0 * precision / total + rouge2_r = 100.0 * rouge2_r / total + rouge2_f = 100.0 * rouge2_f / total + rougesu4_r = 100.0 * rougesu4_r / total + rougesu4_f = 100.0 * rougesu4_f / total + #rougel = 100.0 * rougel / total #sentence_recall = 100.0 * sentence_recall / total #sentence_cover = 100.0 * sentence_cover / 
total return {'exact_match': exact_match, 'f1': f1, "recall": recall, #"sentence_recall": sentence_recall, "sentence_cover": sentence_cover, - "precision": precision, "cover": cover, "overlap": overlap} + "precision": precision, "cover": cover, "overlap": overlap, + "rouge2_r": rouge2_r, "rouge2_f":rouge2_f, "rougesu4_r": rougesu4_r, "rougesu4_f": rougesu4_f} def squad_v1_eval(dataset_filename, prediction_filename): expected_version = '1.1' with open(dataset_filename) as dataset_file: dataset_json = json.load(dataset_file) - if dataset_json['version'] != expected_version: - logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( - expected_version, dataset_json['version'])) + #if dataset_json['version'] != expected_version: + # logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( + # expected_version, dataset_json['version'])) dataset = dataset_json['data'] with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index a204218..a88e622 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -2,8 +2,9 @@ from tqdm import tqdm from bertserini.reader.bert_reader import BERT from bertserini.retriever.pyserini_retriever import retriever, build_searcher -from bertserini.utils.utils_new import extract_squad_questions +from bertserini.utils.utils import extract_squad_questions from bertserini.experiments.args import * +import time if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path, do_strip_accents=args.strip_accents) @@ -13,8 +14,11 @@ all_answer = [] for question in tqdm(questions): + # print("before retriever:", time.time()) contexts = retriever(question, searcher, args.topk) + # print("before reader:", time.time()) final_answers = bert_reader.predict(question, contexts) + # print("after reader:", time.time()) final_answers_lst = [] for ans in 
final_answers: final_answers_lst.append( diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index b0256ce..173193a 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -2,33 +2,16 @@ import torch from torch.utils.data import DataLoader, SequentialSampler -from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample, squad_convert_examples_to_features -from transformers.data.processors.squad import SquadResult +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, EvalPrediction +from datasets import Dataset +import numpy as np +from bertserini.utils.utils_qa import postprocess_qa_predictions from bertserini.reader.base import Reader, Question, Context, Answer -from bertserini.utils.utils_squad_metrics import compute_predictions_logits -__all__ = ['BERT'] - -from bertserini.train.run_squad import to_list - -def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: - examples = [] - for idx, ctx in enumerate(contexts): - examples.append( - SquadExample( - qas_id=idx, - question_text=question.text, - context_text=ctx.text, - answer_text=None, - start_position_character=None, - title="", - is_impossible=False, - answers=[], - ) - ) - return examples +from datasets.utils import logging +__all__ = ['BERT'] class BERT(Reader): def __init__(self, args): @@ -37,7 +20,7 @@ def __init__(self, args): self.model_args.tokenizer_name = self.model_args.model_name_or_path self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True, use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) self.args = { "max_seq_length": 384, 
"doc_stride": 128, @@ -45,7 +28,7 @@ def __init__(self, args): "threads": 1, "tqdm_enabled": False, "n_best_size": 20, - "max_answer_length": 30, + "max_answer_length": 384, "do_lower_case": True, "output_prediction_file": False, "output_nbest_file": self.model_args.output_nbest_file, @@ -53,6 +36,7 @@ def __init__(self, args): "verbose_logging": False, "version_2_with_negative": True, "null_score_diff_threshold": 0, + "pad_on_right": False, } def update_args(self, args_to_change): @@ -60,77 +44,155 @@ def update_args(self, args_to_change): self.args[key] = args_to_change[key] def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: - examples = craft_squad_examples(question, contexts) - - features, dataset = squad_convert_examples_to_features( - examples=examples, - tokenizer=self.tokenizer, - max_seq_length=self.args["max_seq_length"], - doc_stride=self.args["doc_stride"], - max_query_length=self.args["max_query_length"], - is_training=False, - return_dataset="pt", - threads=self.args["threads"], - tqdm_enabled=self.args["tqdm_enabled"] - ) + logging.disable_progress_bar() + + def prepare_validation_features(examples): + question_column_name = "question" + context_column_name = "context" + # answer_column_name = "answers" if "answers" in column_names else column_names[2] + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = self.tokenizer( + examples[question_column_name if self.args["pad_on_right"] else context_column_name], + examples[context_column_name if self.args["pad_on_right"] else question_column_name], + truncation="only_second" if self.args["pad_on_right"] else "only_first", + max_length=self.args["max_seq_length"], + stride=self.args["doc_stride"], + return_overflowing_tokens=True, + return_offsets_mapping=True, + verbose=False, + padding="max_length", + ) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if self.args["pad_on_right"] else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + return tokenized_examples + + def create_and_fill_np_array(start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. We can only enter either start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + + if step + batch_size < len(dataset): + logits_concat[step: step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. 
+ _, all_nbest_json = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=self.args["version_2_with_negative"], + n_best_size=self.args["n_best_size"], + max_answer_length=self.args["max_answer_length"], + null_score_diff_threshold=self.args["null_score_diff_threshold"], + output_dir="./tmp/", + prefix=stage, + ) + return all_nbest_json + + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=1, + remove_columns=column_names, + ) - all_results = [] + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + self.model.eval() + all_start_logits = [] + all_end_logits = [] for batch in eval_dataloader: - self.model.eval() - batch = tuple(t.to(self.device) for t in batch) + for k in batch: + batch[k] = batch[k].to(self.device) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2], - } - feature_indices = batch[3] - outputs = self.model(**inputs) - - for i, feature_index in enumerate(feature_indices): - eval_feature = features[feature_index.item()] - unique_id = int(eval_feature.unique_id) - - output = [outputs[oname][i] for oname in outputs] - start_logits = outputs.start_logits[i] - end_logits = outputs.end_logits[i] - try: - start_logits = start_logits.item() - end_logits = end_logits.item() - except: - pass - - result = SquadResult(unique_id, start_logits, end_logits) - all_results.append(result) - - answers, nbest = 
compute_predictions_logits( - all_examples=examples, - all_features=features, - all_results=all_results, - n_best_size=self.args["n_best_size"], - max_answer_length=self.args["max_answer_length"], - do_lower_case=self.args["do_lower_case"], - output_prediction_file=self.args["output_prediction_file"], - output_nbest_file=self.args["output_nbest_file"], - output_null_log_odds_file=self.args["output_null_log_odds_file"], - verbose_logging=self.args["verbose_logging"], - version_2_with_negative=self.args["version_2_with_negative"], - null_score_diff_threshold=self.args["null_score_diff_threshold"], - tokenizer=self.tokenizer, - language=question.language - ) + outputs = self.model(**batch) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + all_start_logits.append(start_logits.cpu().numpy()) + all_end_logits.append(end_logits.cpu().numpy()) - all_answers = [] + start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, self.args["max_answer_length"]) + end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, self.args["max_answer_length"]) - for idx, ans in enumerate(nbest): + del all_start_logits + del all_end_logits + + outputs_numpy = (start_logits_concat, end_logits_concat) + + all_nbest_json = post_processing_function(eval_examples, eval_dataset, outputs_numpy) + + all_answers = [] + for idx, ans in enumerate(all_nbest_json): all_answers.append(Answer( - text=nbest[ans][0]["text"], - score=nbest[ans][0]["start_logit"] + nbest[ans][0]["end_logit"], + text=all_nbest_json[ans][0]["text"], + score=all_nbest_json[ans][0]["probability"], + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/reader/t5_reader.py b/bertserini/reader/t5_reader.py new file mode 100644 index 0000000..347181a --- /dev/null +++ b/bertserini/reader/t5_reader.py @@ -0,0 +1,208 @@ +from typing import List + +import torch 
+from torch.utils.data import DataLoader, SequentialSampler +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, EvalPrediction +import datasets +from datasets import Dataset +import numpy as np +from typing import List, Optional, Tuple + +from bertserini.reader.base import Reader, Question, Context, Answer + +from datasets.utils import logging + +__all__ = ['T5'] +class T5(Reader): + def __init__(self, args): + self.model_args = args + if self.model_args.tokenizer_name is None: + self.model_args.tokenizer_name = self.model_args.model_name_or_path + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) + self.question_column = 'question' + self.context_column = 'context' + self.answer_column = 'answers' + ''' + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 1 \ + --output_dir ./models/s2s_squad2_0train/ \ + --eval_accumulation_steps 1 \ + --predict_with_generate \ + ''' + self.args = { + "max_seq_length": 384, + "doc_stride": 128, + "max_query_length": 64, + "threads": 1, + "tqdm_enabled": False, + "n_best_size": 20, + "max_answer_length": 384, + "do_lower_case": True, + "output_prediction_file": False, + "output_nbest_file": self.model_args.output_nbest_file, + "output_null_log_odds_file": None, + "verbose_logging": False, + "version_2_with_negative": True, + "null_score_diff_threshold": 0, + "ignore_pad_token_for_loss": True + } + + def update_args(self, args_to_change): + for key in args_to_change: + self.args[key] = args_to_change[key] + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + logging.disable_progress_bar() + + def preprocess_squad_batch( + examples, + question_column: str, + context_column: str, + answer_column: str, + ) -> 
Tuple[List[str], List[str]]: + questions = examples[question_column] + contexts = examples[context_column] + answers = examples.get(answer_column,[]) + + def generate_input(_question, _context): + return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) + + inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] + targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] + return inputs, targets + + def preprocess_function(examples): + inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer(inputs, max_length=self.args["max_seq_length"], padding='max_length', truncation=True) + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + # Validation preprocessing + def preprocess_validation_function(examples): + inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer( + inputs, + max_length=self.args["max_seq_length"], + padding='max_length', + truncation=True, + return_offsets_mapping=True, + ) + + if targets: + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + # sample_mapping = model_inputs.pop("overflow_to_sample_mapping") + sample_mapping = list(range(len(model_inputs["input_ids"]))) + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + model_inputs["example_id"] = [] + + for i in range(len(model_inputs["input_ids"])): + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + model_inputs["example_id"].append(examples["id"][sample_index]) + + if targets: + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + def post_processing_function(examples: datasets.Dataset, features: datasets.Dataset, outputs, stage="eval"): + # Decode the predicted tokens. + decoded_preds = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} + predictions = {} + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # This is the index of the feature associated to the current example. + feature_index = feature_per_example[example_index] + predictions[example["id"]] = decoded_preds[feature_index] + + # Format the result to the format the metric expects. 
+ if self.args['version_2_with_negative']: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + # references = [{"id": ex["id"], "answers": ex[self.answer_column]} for ex in examples] + # return EvalPrediction(predictions=formatted_predictions, label_ids=references) + return formatted_predictions + + + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + print(inputs) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + eval_dataset = eval_examples.map( + preprocess_validation_function, + batched=True, + num_proc=1, + remove_columns=column_names, + ) + + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + raw_predict = [] + for batch in eval_dataloader: + for k in batch: + batch[k] = batch[k].to(self.device) + outs = self.model.generate(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=16, + early_stopping=True) + raw_predict.extend(outs) + all_nbest_json = post_processing_function(eval_examples, eval_dataset, raw_predict) + + all_answers = [] + for item in all_nbest_json: + all_answers.append(Answer( + text=item["prediction_text"], + score=0.0, + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], + ctx_score=contexts[item['id']].score, + language=question.language + )) + return all_answers + diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index fdd283e..ad17c27 100644 --- 
a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -86,5 +86,6 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='ra if s in t: continue metadata = {} - contexts.append(Context(t, language, metadata, score)) + + contexts.append(Context(hit, language=language, metadata=metadata, score=score)) return contexts diff --git a/bertserini/utils/utils.py b/bertserini/utils/utils.py index 5a3d2e4..ad4f2b6 100644 --- a/bertserini/utils/utils.py +++ b/bertserini/utils/utils.py @@ -5,7 +5,9 @@ import re import zhon import numpy as np +from hanziconv import HanziConv +from bertserini.reader.base import Question def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) @@ -174,3 +176,25 @@ def remove_punc(text): return ''.join(ch for ch in text if ch not in exclude) return remove_punc(s) + +def get_best_answer(candidates, weight=0.5): + for ans in candidates: + ans.aggregate_score(weight) + return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] + + +def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): + data = json.load(open(squad_filename, 'r')) + data = data["data"] + questions = [] + for article in data: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + id_ = qa["id"] + question = qa["question"] + if do_strip_accents: + question = strip_accents(question) + if language == "zh": + HanziConv.toSimplified(question) + questions.append(Question(question, id_, language)) + return questions diff --git a/bertserini/utils/utils_new.py b/bertserini/utils/utils_new.py deleted file mode 100644 index 09ee01f..0000000 --- a/bertserini/utils/utils_new.py +++ /dev/null @@ -1,28 +0,0 @@ -import json -from hanziconv import HanziConv - -from bertserini.reader.base import Question -from bertserini.utils.utils import strip_accents - - -def get_best_answer(candidates, weight=0.5): - for ans in candidates: - 
ans.aggregate_score(weight) - return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] - - -def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): - data = json.load(open(squad_filename, 'r')) - data = data["data"] - questions = [] - for article in data: - for paragraph in article["paragraphs"]: - for qa in paragraph["qas"]: - id_ = qa["id"] - question = qa["question"] - if do_strip_accents: - question = strip_accents(question) - if language == "zh": - HanziConv.toSimplified(question) - questions.append(Question(question, id_, language)) - return questions \ No newline at end of file diff --git a/bertserini/utils/utils_qa.py b/bertserini/utils/utils_qa.py new file mode 100644 index 0000000..dedcd52 --- /dev/null +++ b/bertserini/utils/utils_qa.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. 
+""" +import collections +import json +import logging +import os +from typing import Optional, Tuple + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. 
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. 
+ logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. 
+ if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. 
+        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): +            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + +        # NOTE: the softmax below is commented out in this version, so `probs` holds the raw +        # summed start/end logits — `pred["probability"]` is a score, not a normalized probability. +        probs = np.array([pred.pop("score") for pred in predictions]) +        # exp_scores = np.exp(scores - np.max(scores)) +        # probs = exp_scores / exp_scores.sum() + +        # Include the probabilities in our predictions. +        for prob, pred in zip(probs, predictions): +            pred["probability"] = prob + +        # Pick the best prediction. If the null answer is not possible, this is easy. +        if not version_2_with_negative: +            all_predictions[example["id"]] = predictions[0]["text"] +        else: +            # Otherwise we first need to find the best non-empty prediction. +            i = 0 +            while predictions[i]["text"] == "": +                i += 1 +            best_non_null_pred = predictions[i] + +            # Then we compare to the null prediction using the threshold. +            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] +            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable. +            if score_diff > null_score_diff_threshold: +                all_predictions[example["id"]] = "" +            else: +                all_predictions[example["id"]] = best_non_null_pred["text"] + +        # Make `predictions` JSON-serializable by casting np.float back to float. +        all_nbest_json[example["id"]] = [ +            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} +            for pred in predictions +        ] + +    # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, all_nbest_json + + +def postprocess_qa_predictions_with_beam_search( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + start_n_top: int = 5, + end_n_top: int = 5, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the + original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as + cls token predictions. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). 
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + start_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. + end_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. 
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 
+ for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length negative or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). 
+ scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py index 
e3b5716..284af9e 100644 --- a/bertserini/utils/utils_squad_metrics.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -251,7 +251,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): +def get_final_text(pred_text, orig_text, do_lower_case, tokenizer, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -295,7 +295,7 @@ def _strip_spaces(text): # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) + #tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True) if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: @@ -422,6 +422,8 @@ def compute_predictions_logits( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() + import os + os.environ["TOKENIZERS_PARALLELISM"] = "false" for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] @@ -491,7 +493,9 @@ def compute_predictions_logits( seen_predictions = {} nbest = [] + c = 0 for pred in prelim_predictions: + c += 1 if len(nbest) >= n_best_size: break feature = features[pred.feature_index] @@ -518,7 +522,7 @@ def compute_predictions_logits( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, language=language) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, verbose_logging=verbose_logging, 
language=language) if "##" in final_text or "[UNK]" in final_text: print(final_text, "||", tok_text, "||", orig_text) @@ -736,7 +740,7 @@ compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, verbose_logging=verbose_logging) if final_text in seen_predictions: continue diff --git a/inference_test.py b/inference_test.py index 69bf0f7..edc06a7 100644 --- a/inference_test.py +++ b/inference_test.py @@ -1,15 +1,16 @@ from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT from bertserini.reader.dpr_reader import DPR -from bertserini.utils.utils_new import get_best_answer +from bertserini.reader.t5_reader import T5 +from bertserini.utils.utils import get_best_answer from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher -ENG_reader = "BERT" +ENG_reader = "T5" do_local_test = True do_bm25_test = True -do_dpr_test = True -do_chinese_test = True +do_dpr_test = False +do_chinese_test = False if ENG_reader == "BERT": args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" @@ -21,19 +22,23 @@ args.tokenizer_name = "facebook/dpr-reader-multiset-base" bert_reader = DPR(args) -question = Question("Why did Mark Twain call the 19th century the glied age?") +elif ENG_reader == "T5": + args.model_name_or_path = "/data/aileen/workspace/t5_test2/models/gpu/checkpoint-10500" + args.tokenizer_name = "t5-base" + bert_reader = T5(args) + +# question = Question("Why did Mark Twain call the 19th century the glied age?") +question = Question("What is the capital city of China?") + print(question.text) if do_local_test: print("######################### Testing Local Context #########################") - contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period
of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] + contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.'), + Context('The "Gilded Age"')] candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 1.0) print("Answer:", answer.text) - if answer.text == "there had been a dramatic expansion of American wealth and prosperity": - print("Local Context Test Passed") - else: - print("Wrong Answer") if do_bm25_test: print("######################### Testing BM25 Context #########################") @@ -41,9 +46,10 @@ searcher = build_searcher(args) contexts = retriever(question, searcher, 10) candidates = bert_reader.predict(question, contexts) + print(candidates) answer = get_best_answer(candidates, 0.45) print("Answer:", answer.text) # todo: no context returned. is the context included? maybe update to another question - print("BM25 Test Passed") + # print("BM25 Test Passed") if do_dpr_test: print("######################### Testing DPR Context #########################") @@ -51,7 +57,7 @@ args.encoder = "facebook/dpr-question_encoder-multiset-base" args.query_tokenizer_name = "facebook/dpr-question_encoder-multiset-base" args.index_path = "../pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" # todo: replicate dpr on wiki and release dpr-indexes - args.device = "cuda:0" + args.device = "cpu" args.sparse_index = "../anserini/lucene-index.ik-nlp-22" searcher = build_searcher(args) contexts = retriever(question, searcher, 10)